diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..0248cc6 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +generated_types/protos/google/ linguist-generated=true +generated_types/protos/grpc/ linguist-generated=true +generated_types/src/wal_generated.rs linguist-generated=true +trace_exporters/src/thrift/ linguist-generated=true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8e56943 --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +**/target +**/*.rs.bk +.idea/ +.env +.gdb_history +*.tsm +**/.DS_Store +**/.vscode +heaptrack.* +massif.out.* +perf.data* +perf.svg +perf.txt +valgrind-out.txt +*.pending-snap diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..fd4a283 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7141 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "ahash" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" +dependencies = [ + "cfg-if", + "const-random", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "aliasable" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstream" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e1ebcb11de5c03c67de28a7df593d32191b44939c482e97702baaaa6ab6a5" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" + +[[package]] +name = "anstyle-parse" +version = "0.2.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "anyhow" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" + +[[package]] +name = "arc-swap" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6" + +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + +[[package]] +name = "arrow" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bc25126d18a012146a888a0298f2c22e1150327bd2765fc76d710a556b2d614" +dependencies = [ + "ahash", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "49.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ccd45e217ffa6e53bbb0080990e77113bdd4e91ddb84e97b77649810bcf1a7" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "num", +] + +[[package]] +name = "arrow-array" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bda9acea48b25123c08340f3a8ac361aa0f74469bb36f5ee9acf923fce23e9d" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.14.3", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01a0fc21915b00fc6c2667b069c1b64bdd920982f426079bc4a7cab86822886c" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dc0368ed618d509636c1e3cc20db1281148190a78f43519487b2daf07b63b4a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", +] + +[[package]] +name = "arrow-csv" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e09aa6246a1d6459b3f14baeaa49606cfdbca34435c46320e14054d244987ca" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "lexical-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907fafe280a3874474678c1858b9ca4cb7fd83fb8034ff5b6d6376205a08c634" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-flight" +version = "49.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "624e0dcb6b5a7a06222bfd2be3f7e905ce849a6b714ec989f18cdba330c77d38" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", + "base64", + "bytes", + "futures", + "once_cell", + "paste", + "prost", + "tokio", + "tonic", +] + +[[package]] +name = "arrow-ipc" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79a43d6808411886b8c7d4f6f7dd477029c1e77ffffffb7923555cc6579639cd" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", + "lz4_flex", +] + +[[package]] +name = "arrow-json" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82565c91fd627922ebfe2810ee4e8346841b6f9361b87505a9acea38b614fee" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.2.2", + "lexical-core", + "num", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-ord" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b23b0e53c0db57c6749997fd343d4c0354c994be7eca67152dd2bdb9a3e1bb4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half", + "num", +] + +[[package]] +name = "arrow-row" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "361249898d2d6d4a6eeb7484be6ac74977e48da12a4dd81a708d620cc558117a" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", + "hashbrown 0.14.3", +] + +[[package]] +name = "arrow-schema" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"09e28a5e781bf1b0f981333684ad13f5901f4cd2f20589eab7cf1797da8fc167" + +[[package]] +name = "arrow-select" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f6208466590960efc1d2a7172bc4ff18a67d6e25c529381d7f96ddaf0dc4036" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a48149c63c11c9ff571e50ab8f017d2a7cb71037a882b42f6354ed2da9acc7" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "num", + "regex", + "regex-syntax 0.8.2", +] + +[[package]] +name = "arrow_util" +version = "0.1.0" +dependencies = [ + "ahash", + "arrow", + "chrono", + "comfy-table", + "datafusion", + "hashbrown 0.14.3", + "num-traits", + "once_cell", + "proptest", + "rand", + "regex", + "snafu 0.8.0", + "uuid", + "workspace-hack", +] + +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "assert_cmd" +version = "2.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00ad3f3a942eee60335ab4342358c161ee296829e0d16ff42fc1d6cb07815467" +dependencies = [ + "anstyle", + "bstr", + "doc-comment", + "predicates", + "predicates-core", + "predicates-tree", + "wait-timeout", +] + +[[package]] +name = "assert_matches" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" + +[[package]] +name = "async-channel" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1ca33f4bc4ed1babef42cad36cc1f51fa88be00420404e5b1e80ab1b18f7678c" +dependencies = [ + "concurrent-queue", + "event-listener 4.0.3", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-compression" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a116f46a969224200a0a97f29cfd4c50e7534e4b4826bd23ea2c3c533039c82c" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener 2.5.3", +] + +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "async-trait" +version = "0.1.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atomic-write-file" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "edcdbedc2236483ab103a53415653d6b4442ea6141baf1ffa85df29635e88436" +dependencies = [ + "nix 0.27.1", + "rand", +] + +[[package]] +name = "authz" +version = "0.1.0" +dependencies = [ + "assert_matches", + "async-trait", + "backoff 0.1.0", + "base64", + "generated_types", + "http", + "iox_time", + "metric", + "observability_deps", + "parking_lot", + "paste", + "snafu 0.8.0", + "test_helpers_end_to_end", + "tokio", + "tonic", + "workspace-hack", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "axum" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" +dependencies = [ + "async-trait", + "axum-core", + "bitflags 1.3.2", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + "rustversion", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backoff" +version = "0.1.0" +dependencies = [ + "observability_deps", + "rand", + "snafu 0.8.0", + "tokio", + "workspace-hack", +] + +[[package]] +name = "backoff" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62ddb9cb1ec0a098ad4bbf9344d0713fa193ae1a80af55febcff2627b6a00c1" +dependencies = [ + "getrandom", + "instant", + "rand", +] + +[[package]] +name = "backtrace" +version = "0.3.69" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +dependencies = [ + "serde", +] + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0231f06152bf547e9c2b5194f247cd97aacf6dcd8b15d8e5ec0663f64580da87" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "brotli" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "2.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bstr" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" +dependencies = [ + "memchr", + "regex-automata 0.4.5", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" + +[[package]] +name = "bytecount" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" + +[[package]] +name = "bytemuck" +version = "1.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea31d69bda4949c1c1562c1e6f042a1caefac98cdc8a298260a2ff41c1e2d42b" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cache_system" +version = "0.1.0" +dependencies = [ + "async-trait", + "backoff 0.1.0", + "criterion", + "futures", + "iox_time", + "metric", + "observability_deps", + "ouroboros", + "parking_lot", + "pdatastructs", + "proptest", + "rand", + "test_helpers", + "tokio", + "tokio-util", + "trace", + "tracker", + "workspace-hack", +] + +[[package]] +name = "camino" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo-platform" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ceed8ef69d8518a5dda55c07425450b58a4e1946f4951eab6d7191ee86c2443d" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", +] + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "catalog_cache" +version = "0.1.0" +dependencies = [ + "bytes", + "dashmap", + "futures", + "hyper", + "reqwest", + "snafu 0.8.0", + "tokio", + "tokio-util", + "url", + "workspace-hack", +] + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "jobserver", + "libc", +] + +[[package]] +name = "cfg-if" +version = 
"1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-targets 0.52.0", +] + +[[package]] +name = "chrono-tz" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91d7b79e99bfaa0d47da0687c43aa3b7381938a62ad3a6498599039321f660b7" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" +dependencies = [ + "parse-zoneinfo", + "phf", + "phf_codegen", +] + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" +dependencies = [ + "clap_builder", + "clap_derive", +] + 
+[[package]] +name = "clap_blocks" +version = "0.1.0" +dependencies = [ + "clap", + "ed25519-dalek", + "futures", + "http", + "humantime", + "iox_catalog", + "iox_time", + "itertools 0.12.1", + "metric", + "non-empty-string", + "object_store", + "observability_deps", + "parquet_cache", + "snafu 0.8.0", + "sysinfo", + "tempfile", + "test_helpers", + "trace_exporters", + "trogging", + "url", + "uuid", + "workspace-hack", +] + +[[package]] +name = "clap_builder" +version = "4.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "clap_lex" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" + +[[package]] +name = "client_util" +version = "0.1.0" +dependencies = [ + "http", + "mockito", + "reqwest", + "thiserror", + "tokio", + "tonic", + "tower", + "workspace-hack", +] + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "comfy-table" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c64043d6c7b7a4c58e39e7efccfdea7b93d885a795d0c054a69dbbf4dd52686" +dependencies = [ + "strum", + "strum_macros", + "unicode-width", +] + +[[package]] +name = "concurrent-queue" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d16048cd947b08fa32c24458a22f5dc5e835264f689f4f5653210c69fd107363" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "const-random" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "constant_time_eq" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "cpp_demangle" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7e8227005286ec39567949b33df9896bcadfa6051bccca2488129f108ca23119" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ec7a15cbe22e59248fc7eadb1907dab5ba09372595da4d73dd805ed4417dfe" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "futures", + "is-terminal", + "itertools 0.10.5", + "num-traits", + "once_cell", + "oorandom", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "tokio", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools 0.10.5", +] + +[[package]] +name = "croaring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7266f0a7275b00ce4c4f4753e8c31afdefe93828101ece83a06e2ddab1dd1010" +dependencies = [ + "byteorder", + "croaring-sys", +] + 
+[[package]] +name = "croaring-sys" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e47112498c394a7067949ebc07ef429b7384a413cf0efcf675846a47bcd307fb" +dependencies = [ + "cc", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "176dc175b78f56c0f321911d9c8eb2b77a78a4860b9c19db83835fea1a46649b" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + +[[package]] +name = "curve25519-dalek" +version = "4.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a677b8922c94e01bdbb12126b0bc852f00447528dee1782229af9c720c3f348" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "platforms", + "rustc_version", + "subtle", + "zeroize", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "darling" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc5d6b04b3fd0ba9926f945895de7d806260a2d7431ba82e7edaecb043c4c6b8" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e48a959bcd5c761246f5d090ebc2fbf7b9cd527a492b07a67510c108f1e7e3" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.48", +] + +[[package]] +name = "darling_macro" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1545d67a2149e1d93b7e5c7752dce5a7426eb5d1357ddcfd89336b94444f77" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "dashmap" +version = "5.5.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.3", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "data_types" +version = "0.1.0" +dependencies = [ + "arrow-buffer", + "assert_matches", + "bytes", + "chrono", + "croaring", + "generated_types", + "hex", + "influxdb-line-protocol", + "iox_time", + "murmur3", + "observability_deps", + "once_cell", + "ordered-float 4.2.0", + "paste", + "percent-encoding", + "proptest", + "prost", + "schema", + "serde_json", + "sha2", + "siphasher 1.0.0", + "snafu 0.8.0", + "sqlx", + "test_helpers", + "thiserror", + "uuid", + "workspace-hack", +] + +[[package]] +name = "datafusion" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-ipc", + "arrow-schema", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", + "flate2", + "futures", + "glob", + "half", + "hashbrown 0.14.3", + "indexmap 2.2.2", + "itertools 0.12.1", + "log", + "num_cpus", + "object_store", + "parking_lot", + "parquet", + "pin-project-lite", + "rand", + "sqlparser", + "tempfile", + "tokio", + "tokio-util", + "url", + "uuid", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-common" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "chrono", + "half", + "libc", + "num_cpus", + "object_store", + "parquet", + 
"sqlparser", +] + +[[package]] +name = "datafusion-execution" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" +dependencies = [ + "arrow", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "hashbrown 0.14.3", + "log", + "object_store", + "parking_lot", + "rand", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "datafusion-common", + "paste", + "sqlparser", + "strum", + "strum_macros", +] + +[[package]] +name = "datafusion-optimizer" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "hashbrown 0.14.3", + "itertools 0.12.1", + "log", + "regex-syntax 0.8.2", +] + +[[package]] +name = "datafusion-physical-expr" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "base64", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-expr", + "half", + "hashbrown 0.14.3", + "hex", + "indexmap 2.2.2", + "itertools 0.12.1", + "log", + "md-5", + "paste", + "petgraph", + "rand", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-physical-plan" +version = "34.0.0" +source = 
"git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "futures", + "half", + "hashbrown 0.14.3", + "indexmap 2.2.2", + "itertools 0.12.1", + "log", + "once_cell", + "parking_lot", + "pin-project-lite", + "rand", + "tokio", + "uuid", +] + +[[package]] +name = "datafusion-proto" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" +dependencies = [ + "arrow", + "chrono", + "datafusion", + "datafusion-common", + "datafusion-expr", + "object_store", + "prost", +] + +[[package]] +name = "datafusion-sql" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" +dependencies = [ + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-expr", + "log", + "sqlparser", +] + +[[package]] +name = "datafusion_util" +version = "0.1.0" +dependencies = [ + "async-trait", + "datafusion", + "futures", + "object_store", + "observability_deps", + "pin-project", + "schema", + "tokio", + "tokio-stream", + "url", + "workspace-hack", +] + +[[package]] +name = "debugid" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" +dependencies = [ + "uuid", +] + +[[package]] +name = "delegate" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "082a24a9967533dc5d743c602157637116fc1b52806d694a5a45e6f32567fcdd" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "der" 
+version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "dml" +version = "0.1.0" +dependencies = [ + "arrow_util", + "data_types", + "hashbrown 0.14.3", + "iox_time", + "mutable_batch", + "schema", + "trace", + "workspace-hack", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "dotenvy" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + +[[package]] +name = "dyn-clone" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" + +[[package]] +name = 
"ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8", + "signature", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "subtle", + "zeroize", +] + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +dependencies = [ + "serde", +] + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +dependencies = [ + "version_check", +] + +[[package]] +name = "etcetera" +version = "0.8.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "event-listener" +version = "4.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b215c49b2b248c855fb73579eb1f4f26c38ffdc12973e20e07b91d78d5646e" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3" +dependencies = [ + "event-listener 4.0.3", + "pin-project-lite", +] + +[[package]] +name = "executor" +version = "0.1.0" +dependencies = [ + "futures", + "libc", + "metric", + "observability_deps", + "once_cell", + "parking_lot", + "pin-project", + "snafu 0.8.0", + "tokio", + "tokio-util", + "tokio_metrics_bridge", + "tokio_watchdog", + "workspace-hack", +] + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "fiat-crypto" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1676f435fc1dadde4d03e43f5d62b259e1ce5f40bd4ffb21db2b42ebe59c1382" + +[[package]] +name = "filetime" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "windows-sys 0.52.0", +] + +[[package]] +name = "findshlibs" +version = 
"0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flatbuffers" +version = "23.5.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" +dependencies = [ + "bitflags 1.3.2", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "flightsql" +version = "0.1.0" +dependencies = [ + "arrow", + "arrow-flight", + "arrow_util", + "bytes", + "datafusion", + "iox_query", + "observability_deps", + "once_cell", + "prost", + "snafu 0.8.0", + "workspace-hack", +] + +[[package]] +name = "flume" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181" +dependencies = [ + "futures-core", + "futures-sink", + "spin 0.9.8", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-intrusive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot", +] + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "generated_types" +version = "0.1.0" +dependencies = [ + "bytes", + "observability_deps", + "pbjson", + "pbjson-build", + "pbjson-types", + "prost", + "prost-build", + "serde", + "tonic", + "tonic-build", + "uuid", + "workspace-hack", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gimli" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + +[[package]] +name = "glob" 
+version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "grpc-binary-logger" +version = "0.1.0" +dependencies = [ + "assert_matches", + "base64", + "byteorder", + "bytes", + "futures", + "grpc-binary-logger-proto", + "grpc-binary-logger-test-proto", + "http", + "http-body", + "hyper", + "pin-project", + "prost", + "prost-build", + "tokio", + "tokio-stream", + "tonic", + "tonic-build", + "tower", + "workspace-hack", +] + +[[package]] +name = "grpc-binary-logger-proto" +version = "0.1.0" +dependencies = [ + "prost", + "prost-build", + "prost-types", + "tonic", + "tonic-build", + "workspace-hack", +] + +[[package]] +name = "grpc-binary-logger-test-proto" +version = "0.1.0" +dependencies = [ + "prost", + "prost-build", + "tonic", + "tonic-build", + "workspace-hack", +] + +[[package]] +name = "h2" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 2.2.2", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "half" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + +[[package]] +name = "handlebars" +version = "5.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab283476b99e66691dee3f1640fea91487a8d81f50fb5ecc75538f8f8879a1e4" +dependencies = [ + "log", + "pest", + "pest_derive", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +dependencies = [ + "ahash", + "allocator-api2", +] + +[[package]] +name = "hashlink" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +dependencies = [ + "hashbrown 0.14.3", +] + +[[package]] +name = "heappy" +version = "0.1.0" +source = "git+https://github.com/mkmik/heappy?rev=01a1f88e1b404c5894f89eb1a57f813f713d7ad1#01a1f88e1b404c5894f89eb1a57f813f713d7ad1" +dependencies = [ + "backtrace", + "bytes", + "lazy_static", + "libc", + "pprof", + "spin 0.9.8", + "thiserror", + "tikv-jemalloc-sys", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0c62115964e08cb8039170eb33c1d0e2388a256930279edca206fff675f82c3" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = 
"home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "http" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8947b1a6fad4393052c7ba1f4cd97bed3e953a95c79c92ad9b051a04611d9fbb" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "http-range-header" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "add0ab9360ddbd88cfeb3bd9574a1d85cfdfa14db10b3e21d3700dbc4328758f" + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "0.14.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = 
"hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http", + "hyper", + "log", + "rustls", + "rustls-native-certs", + "tokio", + "tokio-rustls", +] + +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "import_export" +version = "0.1.0" +dependencies = [ + "bytes", + "data_types", + "futures-util", + "generated_types", + "influxdb_iox_client", + "iox_catalog", + "object_store", + "observability_deps", + "parquet_file", + "schema", + "serde_json", + "thiserror", + "tokio", + "tokio-util", + "workspace-hack", +] + 
+[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824b2ae422412366ba479e8111fd301f7b5faece8149317bb81925979a53f520" +dependencies = [ + "equivalent", + "hashbrown 0.14.3", +] + +[[package]] +name = "inferno" +version = "0.11.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "321f0f839cd44a4686e9504b0a62b4d69a50b62072144c71c68f5873c167b8d9" +dependencies = [ + "ahash", + "indexmap 2.2.2", + "is-terminal", + "itoa", + "log", + "num-format", + "once_cell", + "quick-xml 0.26.0", + "rgb", + "str_stack", +] + +[[package]] +name = "influxdb-line-protocol" +version = "1.0.0" +dependencies = [ + "bytes", + "log", + "nom", + "smallvec", + "snafu 0.8.0", + "test_helpers", +] + +[[package]] +name = "influxdb2_client" +version = "0.1.0" +dependencies = [ + "bytes", + "futures", + "mockito", + "once_cell", + "parking_lot", + "reqwest", + "serde", + "serde_json", + "snafu 0.8.0", + "test_helpers", + "tokio", + "url", + "uuid", +] + +[[package]] +name = "influxdb_influxql_parser" +version = "0.1.0" +dependencies = [ + "assert_matches", + "chrono", + "chrono-tz", + "insta", + "nom", + "num-integer", + "num-traits", + "once_cell", + "paste", + "test_helpers", + "workspace-hack", +] + +[[package]] +name = "influxdb_iox_client" +version = "0.1.0" +dependencies = [ + "arrow", + "arrow-flight", + "arrow_util", + "bytes", + "client_util", + "comfy-table", + "futures-util", + "generated_types", + "influxdb-line-protocol", + "insta", + "iox_query_params", + "prost", + "rand", + "reqwest", + "schema", + "serde_json", + "thiserror", + "tokio", + "tokio-stream", + "tonic", +] + +[[package]] +name = "influxdb_storage_client" +version 
= "0.1.0" +dependencies = [ + "client_util", + "futures-util", + "generated_types", + "observability_deps", + "prost", + "tonic", + "workspace-hack", +] + +[[package]] +name = "influxdb_tsm" +version = "0.1.0" +dependencies = [ + "flate2", + "hex", + "integer-encoding 4.0.0", + "observability_deps", + "rand", + "snafu 0.7.5", + "snap", + "test_helpers", + "workspace-hack", +] + +[[package]] +name = "influxrpc_parser" +version = "0.1.0" +dependencies = [ + "generated_types", + "snafu 0.8.0", + "sqlparser", + "workspace-hack", +] + +[[package]] +name = "ingester_query_grpc" +version = "0.1.0" +dependencies = [ + "arrow", + "base64", + "bytes", + "data_types", + "datafusion", + "datafusion-proto", + "flatbuffers", + "pbjson", + "pbjson-build", + "predicate", + "prost", + "prost-build", + "query_functions", + "serde", + "snafu 0.8.0", + "tonic", + "tonic-build", + "workspace-hack", +] + +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags 1.3.2", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + +[[package]] +name = "insta" +version = "1.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d64600be34b2fcfc267740a243fa7744441bb4947a619ac4e5bb6507f35fbfc" +dependencies = [ + "console", + "lazy_static", + "linked-hash-map", + "serde", + "similar", + "yaml-rust", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "integer-encoding" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "924df4f0e24e2e7f9cdd90babb0b96f93b20f3ecfa949ea9e6613756b8c8e1bf" + +[[package]] +name = "iox_catalog" +version = "0.1.0" +dependencies = [ + "assert_matches", + "async-trait", + "backoff 0.1.0", + "catalog_cache", + "data_types", + "dotenvy", + "futures", + "generated_types", + "iox_time", + "log", + "metric", + "mutable_batch", + "mutable_batch_lp", + "observability_deps", + "once_cell", + "parking_lot", + "paste", + "pretty_assertions", + "proptest", + "rand", + "serde", + "siphasher 1.0.0", + "snafu 0.8.0", + "sqlx", + "sqlx-hotswap-pool", + "tempfile", + "test_helpers", + "thiserror", + "tokio", + "tonic", + "trace_http", + "uuid", + "workspace-hack", +] + +[[package]] +name = "iox_data_generator" +version = "0.1.0" +dependencies = [ + "bytes", + "chrono", + "clap", + "criterion", + "datafusion_util", + "futures", + "handlebars", + "humantime", + "influxdb2_client", + "itertools 0.12.1", + "mutable_batch", + "mutable_batch_lp", + "parquet_file", + "rand", + "regex", + "schema", + "serde", + "serde_json", + "snafu 0.8.0", + "test_helpers", + "tokio", + "toml", + "tracing", + "tracing-subscriber", + "uuid", +] + +[[package]] +name = "iox_query" +version = "0.1.0" +dependencies = [ + "arrow", + "arrow_util", + "assert_matches", + "async-trait", + "chrono", + "data_types", + "datafusion", + "datafusion_util", + "executor", + "futures", + "hashbrown 0.14.3", + "indexmap 2.2.2", + "insta", + "iox_time", + "itertools 0.12.1", + "metric", + "object_store", + "observability_deps", + "once_cell", + "parking_lot", + "parquet_file", + "predicate", + "query_functions", + "schema", + "serde", + "snafu 0.8.0", + "test_helpers", + "tokio", + "tokio-stream", + "trace", + "tracker", + "uuid", + 
"workspace-hack", +] + +[[package]] +name = "iox_query_influxql" +version = "0.1.0" +dependencies = [ + "arrow", + "assert_matches", + "chrono", + "chrono-tz", + "datafusion", + "datafusion_util", + "generated_types", + "influxdb_influxql_parser", + "insta", + "iox_query", + "itertools 0.12.1", + "observability_deps", + "once_cell", + "predicate", + "query_functions", + "regex", + "schema", + "serde_json", + "test_helpers", + "thiserror", + "workspace-hack", +] + +[[package]] +name = "iox_query_influxrpc" +version = "0.1.0" +dependencies = [ + "arrow", + "arrow_util", + "data_types", + "datafusion", + "datafusion_util", + "futures", + "hashbrown 0.14.3", + "insta", + "iox_query", + "observability_deps", + "predicate", + "query_functions", + "schema", + "snafu 0.8.0", + "test_helpers", + "tokio", + "workspace-hack", +] + +[[package]] +name = "iox_query_params" +version = "0.1.0" +dependencies = [ + "assert_matches", + "datafusion", + "generated_types", + "observability_deps", + "serde", + "serde_json", + "thiserror", + "workspace-hack", +] + +[[package]] +name = "iox_tests" +version = "0.1.0" +dependencies = [ + "arrow", + "data_types", + "datafusion", + "datafusion_util", + "generated_types", + "iox_catalog", + "iox_query", + "iox_time", + "metric", + "mutable_batch_lp", + "object_store", + "observability_deps", + "parquet_file", + "schema", + "uuid", + "workspace-hack", +] + +[[package]] +name = "iox_time" +version = "0.1.0" +dependencies = [ + "chrono", + "parking_lot", + "tokio", + "workspace-hack", +] + +[[package]] +name = "ioxd_common" +version = "0.1.0" +dependencies = [ + "async-trait", + "authz", + "bytes", + "clap", + "clap_blocks", + "flate2", + "futures", + "generated_types", + "hashbrown 0.14.3", + "heappy", + "http", + "hyper", + "log", + "metric", + "metric_exporters", + "observability_deps", + "parking_lot", + "pprof", + "reqwest", + "serde", + "serde_json", + "serde_urlencoded", + "service_grpc_testing", + "snafu 0.8.0", + "tokio", + 
"tokio-stream", + "tokio-util", + "tonic", + "tonic-health", + "tonic-reflection", + "tower", + "tower-http", + "tower_trailer", + "trace", + "trace_exporters", + "trace_http", + "workspace-hack", +] + +[[package]] +name = "ioxd_test" +version = "0.1.0" +dependencies = [ + "async-trait", + "clap", + "hyper", + "ioxd_common", + "metric", + "snafu 0.8.0", + "tokio-util", + "trace", + "workspace-hack", +] + +[[package]] +name = "ipnet" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" + +[[package]] +name = "is-terminal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" +dependencies = [ + "hermit-abi", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + +[[package]] +name = "jobserver" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" +dependencies = [ + "libc", 
+] + +[[package]] +name = "js-sys" +version = "0.3.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "406cda4b368d531c842222cf9d2600a9a4acce8d29423695379c6868a143a9ee" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "json-patch" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ff1e1486799e3f64129f8ccad108b38290df9cd7015cd31bed17239f0789d6" +dependencies = [ + "serde", + "serde_json", + "thiserror", + "treediff", +] + +[[package]] +name = "jsonpath-rust" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06cc127b7c3d270be504572364f9569761a180b981919dd0d87693a7f5fb7829" +dependencies = [ + "pest", + "pest_derive", + "regex", + "serde_json", + "thiserror", +] + +[[package]] +name = "k8s-openapi" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc3606fd16aca7989db2f84bb25684d0270c6d6fa1dbcd0025af7b4130523a6" +dependencies = [ + "base64", + "bytes", + "chrono", + "schemars", + "serde", + "serde-value", + "serde_json", +] + +[[package]] +name = "kqueue" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + +[[package]] +name = "kube" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3499c8d60c763246c7a213f51caac1e9033f46026904cb89bc8951ae8601f26e" +dependencies = [ + "k8s-openapi", + "kube-client", + "kube-core", + "kube-derive", + "kube-runtime", +] + +[[package]] +name = "kube-client" +version = "0.87.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "033450dfa0762130565890dadf2f8835faedf749376ca13345bcd8ecd6b5f29f" +dependencies = [ + "base64", + "bytes", + "chrono", + "either", + "futures", + "home", + "http", + "http-body", + "hyper", + "hyper-rustls", + "hyper-timeout", + "jsonpath-rust", + "k8s-openapi", + "kube-core", + "pem", + "pin-project", + "rustls", + "rustls-pemfile", + "secrecy", + "serde", + "serde_json", + "serde_yaml", + "thiserror", + "tokio", + "tokio-util", + "tower", + "tower-http", + "tracing", +] + +[[package]] +name = "kube-core" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5bba93d054786eba7994d03ce522f368ef7d48c88a1826faa28478d85fb63ae" +dependencies = [ + "chrono", + "form_urlencoded", + "http", + "json-patch", + "k8s-openapi", + "once_cell", + "schemars", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "kube-derive" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e98dd5e5767c7b894c1f0e41fd628b145f808e981feb8b08ed66455d47f1a4" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "serde_json", + "syn 2.0.48", +] + +[[package]] +name = "kube-runtime" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d8893eb18fbf6bb6c80ef6ee7dd11ec32b1dc3c034c988ac1b3a84d46a230ae" +dependencies = [ + "ahash", + "async-trait", + "backoff 0.4.0", + "derivative", + "futures", + "hashbrown 0.14.3", + "json-patch", + "k8s-openapi", + "kube-client", + "parking_lot", + "pin-project", + "serde", + "serde_json", + "smallvec", + "thiserror", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "kube_test" +version = "0.1.0" +dependencies = [ + "http", + "hyper", + "k8s-openapi", + "kube-core", + "rand", + "serde", + "serde_json", + "serde_yaml", + "tower", + "workspace-hack", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin 0.5.2", +] + +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.153" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "libsqlite3-sys" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + +[[package]] +name = "lock_api" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "logfmt" +version = "0.1.0" +dependencies = [ + "observability_deps", + "once_cell", + "parking_lot", + "regex", + "tracing-subscriber", + "workspace-hack", +] + +[[package]] +name = "lz4_flex" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "912b45c753ff5f7f5208307e8ace7d2a2e30d024e26d3509f3dce546c044ce15" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" + +[[package]] +name = "memmap2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" +dependencies = [ + "libc", +] + +[[package]] +name = "metric" +version = "0.1.0" +dependencies = [ + "parking_lot", + "workspace-hack", +] + +[[package]] +name = "metric_exporters" +version = "0.1.0" +dependencies = [ + "metric", + "observability_deps", + "prometheus", + "test_helpers", + "workspace-hack", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.7.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] +name = "mockito" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8d3038e23466858569c2d30a537f691fa0d53b51626630ae08262943e3bbb8b" +dependencies = [ + "assert-json-diff", + "futures", + "hyper", + "log", + "rand", + "regex", + "serde_json", + "serde_urlencoded", + "similar", + "tokio", +] + +[[package]] +name = "moka" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1911e88d5831f748a4097a43862d129e3c6fca831eecac9b8db6d01d93c9de2" +dependencies = [ + "async-lock", + "async-trait", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "futures-util", + "once_cell", + "parking_lot", + "quanta", + "rustc_version", + "skeptic", + "smallvec", + "tagptr", + "thiserror", + "triomphe", + "uuid", +] + +[[package]] +name = "mpchash" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdd8199faa645318222f8aeb383fca4216a3f75b144f1e264ac74c0835d871a9" +dependencies = [ + "num-traits", + "rand", + "xxhash-rust", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + +[[package]] +name = "mutable_batch" +version = "0.1.0" +dependencies = [ + 
"arrow", + "arrow_util", + "assert_matches", + "data_types", + "hashbrown 0.14.3", + "iox_time", + "itertools 0.12.1", + "mutable_batch_lp", + "partition", + "pretty_assertions", + "proptest", + "rand", + "schema", + "snafu 0.8.0", + "workspace-hack", +] + +[[package]] +name = "mutable_batch_lp" +version = "0.1.0" +dependencies = [ + "arrow_util", + "assert_matches", + "criterion", + "hashbrown 0.14.3", + "influxdb-line-protocol", + "itertools 0.12.1", + "mutable_batch", + "schema", + "snafu 0.8.0", + "test_helpers", + "workspace-hack", +] + +[[package]] +name = "mutable_batch_pb" +version = "0.1.0" +dependencies = [ + "arrow_util", + "data_types", + "dml", + "generated_types", + "hashbrown 0.14.3", + "mutable_batch", + "mutable_batch_lp", + "partition", + "schema", + "snafu 0.8.0", + "workspace-hack", +] + +[[package]] +name = "mutable_batch_tests" +version = "0.1.0" +dependencies = [ + "bytes", + "criterion", + "data_types", + "dml", + "flate2", + "generated_types", + "mutable_batch", + "mutable_batch_lp", + "mutable_batch_pb", + "prost", +] + +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", +] + +[[package]] +name = "nix" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.4.2", + "cfg-if", + "libc", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "non-empty-string" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"55cf0f4060e345ae505219853da9ca1150564158a648a6aa6a528f0d5794bb33" +dependencies = [ + "delegate", +] + +[[package]] +name = "notify" +version = "6.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +dependencies = [ + "bitflags 2.4.2", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "log", + "mio", + "walkdir", + "windows-sys 0.48.0", +] + +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + +[[package]] +name = "num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-complex" +version = "0.4.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec", + "itoa", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d869c01cc0c455284163fd0092f1f93835385ccab5a98a0dcc497b2f8bf055a9" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.8.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2524735495ea1268be33d200e1ee97455096a0846295a21548cd2f3541de7050" +dependencies = [ + "async-trait", + "base64", + "bytes", + "chrono", + "futures", + "humantime", + "hyper", + "itertools 0.11.0", + "parking_lot", + "percent-encoding", + "quick-xml 0.31.0", + "rand", + "reqwest", + "ring", + "rustls-pemfile", + "serde", + "serde_json", + "snafu 0.7.5", + "tokio", + "tracing", + "url", + "walkdir", +] + +[[package]] +name = "object_store_metrics" +version = "0.1.0" +dependencies = [ + "async-trait", + "bytes", + "futures", + "iox_time", + "metric", + "object_store", + "pin-project", + "snafu 0.8.0", + "tokio", + "workspace-hack", +] + +[[package]] +name = "observability_deps" +version = "0.1.0" +dependencies = [ + "tracing", + "workspace-hack", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +dependencies = [ + "parking_lot_core", +] + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-float" +version = "4.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ouroboros" +version = 
"0.18.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b7be5a8a3462b752f4be3ff2b2bf2f7f1d00834902e46be2a4d68b87b0573c" +dependencies = [ + "aliasable", + "ouroboros_macro", + "static_assertions", +] + +[[package]] +name = "ouroboros_macro" +version = "0.18.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b645dcde5f119c2c454a92d0dfa271a2a3b205da92e4292a68ead4bdbfde1f33" +dependencies = [ + "heck", + "itertools 0.12.1", + "proc-macro2", + "proc-macro2-diagnostics", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "panic_logging" +version = "0.1.0" +dependencies = [ + "metric", + "observability_deps", + "test_helpers", + "workspace-hack", +] + +[[package]] +name = "parking" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.48.5", +] + +[[package]] +name = "parquet" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af88740a842787da39b3d69ce5fbf6fce97d20211d3b299fee0a0da6430c74d4" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + 
"arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "hashbrown 0.14.3", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "parquet_cache" +version = "0.1.0" +dependencies = [ + "ahash", + "arc-swap", + "assert_matches", + "async-channel", + "async-trait", + "backoff 0.1.0", + "bytes", + "chrono", + "data_types", + "fnv", + "futures", + "http", + "hyper", + "iox_catalog", + "iox_tests", + "iox_time", + "k8s-openapi", + "kube", + "kube_test", + "lazy_static", + "moka", + "mpchash", + "notify", + "object_store", + "observability_deps", + "parking_lot", + "parquet_file", + "pin-project", + "rand", + "reqwest", + "schemars", + "serde", + "serde_json", + "tempfile", + "thiserror", + "tokio", + "tokio-stream", + "tokio-util", + "tower", + "url", + "uuid", + "workspace-hack", +] + +[[package]] +name = "parquet_file" +version = "0.1.0" +dependencies = [ + "arrow", + "assert_matches", + "base64", + "bytes", + "data_types", + "datafusion", + "datafusion_util", + "futures", + "generated_types", + "iox_time", + "object_store", + "observability_deps", + "parquet", + "pbjson-types", + "prost", + "rand", + "schema", + "snafu 0.8.0", + "test_helpers", + "thiserror", + "thrift", + "tokio", + "uuid", + "workspace-hack", + "zstd", +] + +[[package]] +name = "parquet_to_line_protocol" +version = "0.1.0" +dependencies = [ + "datafusion", + "datafusion_util", + "futures", + "influxdb-line-protocol", + "mutable_batch_lp", + "num_cpus", + "object_store", + "parquet_file", + "schema", + "snafu 0.8.0", + "tokio", + "workspace-hack", +] + +[[package]] +name = "parse-zoneinfo" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" +dependencies = [ + "regex", +] + +[[package]] +name = "partition" +version = "0.1.0" 
+dependencies = [ + "arrow", + "assert_matches", + "chrono", + "criterion", + "data_types", + "generated_types", + "hashbrown 0.14.3", + "mutable_batch", + "mutable_batch_lp", + "paste", + "percent-encoding", + "proptest", + "rand", + "schema", + "test_helpers", + "thiserror", + "unicode-segmentation", + "workspace-hack", +] + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "pbjson" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1030c719b0ec2a2d25a5df729d6cff1acf3cc230bf766f4f97833591f7577b90" +dependencies = [ + "base64", + "serde", +] + +[[package]] +name = "pbjson-build" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2580e33f2292d34be285c5bc3dba5259542b083cfad6037b6d70345f24dcb735" +dependencies = [ + "heck", + "itertools 0.11.0", + "prost", + "prost-types", +] + +[[package]] +name = "pbjson-types" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18f596653ba4ac51bdecbb4ef6773bc7f56042dc13927910de1684ad3d32aa12" +dependencies = [ + "bytes", + "chrono", + "pbjson", + "pbjson-build", + "prost", + "prost-build", + "serde", +] + +[[package]] +name = "pdatastructs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bdcb4943c3c68659690124771ffb2fd93b73900bd0fb47e934f7b8b2e6687fa" +dependencies = [ + "fixedbitset", +] + +[[package]] +name = "pem" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" +dependencies = [ + "base64", + "serde", +] + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pest" +version = "2.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219c0dcc30b6a27553f9cc242972b67f75b60eb0db71f0b5462f38b058c41546" +dependencies = [ + "memchr", + "thiserror", + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e1288dbd7786462961e69bfd4df7848c1e37e8b74303dbdab82c3a9cdd2809" +dependencies = [ + "pest", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1381c29a877c6d34b8c176e734f35d7f7f5b3adaefe940cb4d1bb7af94678e2e" +dependencies = [ + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "pest_meta" +version = "2.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0934d6907f148c22a3acbda520c7eed243ad7487a30f51f6ce52b58b7077a8a" +dependencies = [ + "once_cell", + "pest", + "sha2", +] + +[[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset", + "indexmap 2.2.2", +] + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher 0.3.11", +] + +[[package]] +name = "pin-project" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0302c4a0442c456bd56f841aee5c3bfd17967563f6fadc9ceb9f9c23cf3807e0" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" 
+dependencies = [ + "der", + "spki", +] + +[[package]] +name = "pkg-config" +version = "0.3.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" + +[[package]] +name = "platforms" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "626dec3cac7cc0e1577a2ec3fc496277ec2baa084bebad95bb6fdbfae235f84c" + +[[package]] +name = "pprof" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef5c97c51bd34c7e742402e216abdeb44d415fbe6ae41d56b114723e953711cb" +dependencies = [ + "backtrace", + "cfg-if", + "findshlibs", + "inferno", + "libc", + "log", + "nix 0.26.4", + "once_cell", + "parking_lot", + "prost", + "prost-build", + "prost-derive", + "protobuf", + "sha2", + "smallvec", + "symbolic-demangle", + "tempfile", + "thiserror", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "predicate" +version = "0.1.0" +dependencies = [ + "arrow", + "chrono", + "data_types", + "datafusion", + "datafusion_util", + "itertools 0.12.1", + "observability_deps", + "query_functions", + "schema", + "snafu 0.8.0", + "sqlparser", + "test_helpers", + "workspace-hack", +] + +[[package]] +name = "predicates" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68b87bfd4605926cdfefc1c3b5f8fe560e3feca9d5552cf68c466d3d8236c7e8" +dependencies = [ + "anstyle", + "difflib", + "predicates-core", +] + +[[package]] +name = "predicates-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b794032607612e7abeb4db69adb4e33590fa6cf1149e95fd7cb00e634b92f174" + +[[package]] +name = "predicates-tree" +version = "1.0.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "368ba315fb8c5052ab692e68a0eefec6ec57b23a36959c14496f0b0df2c0cecf" +dependencies = [ + "predicates-core", + "termtree", +] + +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi 0.5.1", +] + +[[package]] +name = "prettyplease" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" +dependencies = [ + "proc-macro2", + "syn 2.0.48", +] + +[[package]] +name = "proc-macro2" +version = "1.0.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", + "version_check", + "yansi 1.0.0-rc.1", +] + +[[package]] +name = "prometheus" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "memchr", + "parking_lot", + "thiserror", +] + +[[package]] +name = "proptest" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31b476131c3c86cb68032fdc5cb6d5a1045e3e42d96b69fa599fd77701e1f5bf" +dependencies = [ + "bitflags 2.4.2", + "lazy_static", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax 0.8.2", + "unarray", +] + +[[package]] +name = "prost" +version = "0.12.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c289cda302b98a28d40c8b3b90498d6e526dd24ac2ecea73e4e491685b94a" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c55e02e35260070b6f716a2423c2ff1c3bb1642ddca6f99e1f26d06268a0e2d2" +dependencies = [ + "bytes", + "heck", + "itertools 0.11.0", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn 2.0.48", + "tempfile", + "which", +] + +[[package]] +name = "prost-derive" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efb6c9a1dd1def8e2124d17e83a20af56f1570d6c2d2bd9e266ccb768df3840e" +dependencies = [ + "anyhow", + "itertools 0.11.0", + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "prost-types" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "193898f59edcf43c26227dcd4c8427f00d99d61e95dcde58dabd49fa291d470e" +dependencies = [ + "prost", +] + +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + +[[package]] +name = "pulldown-cmark" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57206b407293d2bcd3af849ce869d52068623f19e1b5ff8e8778e3309439682b" +dependencies = [ + "bitflags 2.4.2", + "memchr", + "unicase", +] + +[[package]] +name = "quanta" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ca0b7bac0b97248c40bb77288fc52029cf1459c0461ea1b05ee32ccf011de2c" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + +[[package]] +name = "query_functions" +version = "0.1.0" +dependencies = [ + 
"arrow", + "chrono", + "datafusion", + "datafusion_util", + "itertools 0.12.1", + "once_cell", + "regex", + "regex-syntax 0.8.2", + "schema", + "snafu 0.8.0", + "tokio", + "workspace-hack", +] + +[[package]] +name = "quick-xml" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f50b1c63b38611e7d4d7f68b82d3ad0cc71a2ad2e7f61fc10f1328d917c93cd" +dependencies = [ + "memchr", +] + +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] + +[[package]] +name = "raw-cpuid" +version = "11.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d86a7c4638d42c44551f4791a20e687dbb4c3de1f33c43dd71e355cd429def1" +dependencies = [ + "bitflags 2.4.2", +] + +[[package]] +name = "rayon" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "regex" +version = "1.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata 0.4.5", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", +] + +[[package]] +name = "regex-automata" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + +[[package]] +name = "regex-syntax" 
+version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "reqwest" +version = "0.11.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6920094eb85afde5e4a138be3f2de8bbdf28000f0029e72c45025a56b042251" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-rustls", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-rustls", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", + "winreg", +] + +[[package]] +name = "rgb" +version = "0.8.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05aaa8004b64fd573fc9d002f4e632d51ad4f026c2b5ba95fcb6c2f32c2c47d8" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "ring" +version = "0.17.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74" +dependencies = [ + "cc", + "getrandom", + "libc", + "spin 0.9.8", + "untrusted", + "windows-sys 0.48.0", +] + +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core", + "signature", + "spki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +dependencies = [ + "bitflags 2.4.2", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.21.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "ryu" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "schema" +version = "0.1.0" +dependencies = [ + "arrow", + "hashbrown 0.14.3", + "indexmap 2.2.2", + "observability_deps", + "once_cell", + "snafu 0.8.0", + "workspace-hack", +] + +[[package]] +name = "schemars" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a28f4c49489add4ce10783f7911893516f15afe45d015608d41faca6bc4d29" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c767fd6fa65d9ccf9cf026122c1b555f2ef9a4f0cea69da4d7dbc3e258d30967" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn 1.0.109", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "secrecy" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bd1c54ea06cfd2f6b63219704de0b9b4f72dcc2b8fdef820be6cd799780e91e" +dependencies = [ + 
"serde", + "zeroize", +] + +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" +dependencies = [ + "serde", +] + +[[package]] +name = "seq-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" + +[[package]] +name = "serde" +version = "1.0.196" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde-value" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c" +dependencies = [ + "ordered-float 2.10.1", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.196" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "serde_derive_internals" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"85bf8229e7920a9f636479437026331ce11aa132b4dde37d121944a44d6e5f3c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "serde_json" +version = "1.0.113" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69801b70b1c3dac963ecb03a364ba0ceda9cf60c71cfe475e99864759c8b8a79" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_spanned" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb3622f419d1296904700073ea6cc23ad690adbd66f13ea683df73298736f0c1" +dependencies = [ + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_yaml" +version = "0.9.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adf8a49373e98a4c5f0ceb5d05aa7c648d75f63774981ed95b7c7443bbd50c6e" +dependencies = [ + "indexmap 2.2.2", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + +[[package]] +name = "service_common" +version = "0.1.0" +dependencies = [ + "arrow", + "datafusion", + "executor", + "tonic", + "workspace-hack", +] + +[[package]] +name = "service_grpc_flight" +version = "0.1.0" +dependencies = [ + "arrow", + "arrow-flight", + "assert_matches", + "async-trait", + "authz", + "bytes", + "data_types", + "datafusion", + "flightsql", + "futures", + "generated_types", + "iox_query", + "iox_query_influxql", + "iox_query_params", + "metric", + "observability_deps", + "prost", + "serde", + "serde_json", + "service_common", + "snafu 0.8.0", + "test_helpers", + "tokio", + "tonic", + "tower_trailer", + "trace", + "trace_http", + "tracker", + "workspace-hack", +] + +[[package]] +name = "service_grpc_testing" +version = "0.1.0" +dependencies = [ + "generated_types", + 
"observability_deps", + "tonic", + "workspace-hack", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "sharder" +version = "0.1.0" +dependencies = [ + "criterion", + "data_types", + "hashbrown 0.14.3", + "mutable_batch", + "mutable_batch_lp", + "parking_lot", + "rand", + "siphasher 1.0.0", + "workspace-hack", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core", +] + +[[package]] +name = "similar" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32fea41aca09ee824cc9724996433064c89f7777e60762749a4170a14abbfa21" + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "siphasher" +version = "1.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe" + +[[package]] +name = "skeptic" +version = "0.13.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16d23b015676c90a0f01c197bfdc786c20342c73a0afdda9025adb0bc42940a8" +dependencies = [ + "bytecount", + "cargo_metadata", + "error-chain", + "glob", + "pulldown-cmark", + "tempfile", + "walkdir", +] + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" + +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive 0.7.5", +] + +[[package]] +name = "snafu" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d342c51730e54029130d7dc9fd735d28c4cd360f1368c01981d4f03ff207f096" +dependencies = [ + "snafu-derive 0.8.0", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "snafu-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080c44971436b1af15d6f61ddd8b543995cf63ab8e677d46b00cc06f4ef267a0" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "snap" +version = "1.1.1" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "socket2" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "sqlformat" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce81b7bd7c4493975347ef60d8c7e8b742d4694f4c49f93e0a12ea263938176c" +dependencies = [ + "itertools 0.12.1", + "nom", + "unicode_categories", +] + +[[package]] +name = "sqlparser" +version = "0.41.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc2c25a6c66789625ef164b4c7d2e548d627902280c13710d33da8222169964" +dependencies = [ + "log", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "sqlx" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"dba03c279da73694ef99763320dea58b51095dfe87d001b1d4b5fe78ba8763cf" +dependencies = [ + "sqlx-core", + "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", +] + +[[package]] +name = "sqlx-core" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84b0a3c3739e220d94b3239fd69fb1f74bc36e16643423bd99de3b43c21bfbd" +dependencies = [ + "ahash", + "atoi", + "byteorder", + "bytes", + "crc", + "crossbeam-queue", + "dotenvy", + "either", + "event-listener 2.5.3", + "futures-channel", + "futures-core", + "futures-intrusive", + "futures-io", + "futures-util", + "hashlink", + "hex", + "indexmap 2.2.2", + "log", + "memchr", + "once_cell", + "paste", + "percent-encoding", + "rustls", + "rustls-pemfile", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlformat", + "thiserror", + "tokio", + "tokio-stream", + "tracing", + "url", + "uuid", + "webpki-roots", +] + +[[package]] +name = "sqlx-hotswap-pool" +version = "0.1.0" +dependencies = [ + "dotenvy", + "either", + "futures", + "rand", + "sqlx", + "tokio", + "workspace-hack", +] + +[[package]] +name = "sqlx-macros" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89961c00dc4d7dffb7aee214964b065072bff69e36ddb9e2c107541f75e4f2a5" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 1.0.109", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0bd4519486723648186a08785143599760f7cc81c52334a55d6a83ea1e20841" +dependencies = [ + "atomic-write-file", + "dotenvy", + "either", + "heck", + "hex", + "once_cell", + "proc-macro2", + "quote", + "serde", + "serde_json", + "sha2", + "sqlx-core", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", + "syn 1.0.109", + "tempfile", + "tokio", + "url", +] + +[[package]] +name = "sqlx-mysql" +version = "0.7.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e37195395df71fd068f6e2082247891bc11e3289624bbc776a0cdfa1ca7f1ea4" +dependencies = [ + "atoi", + "base64", + "bitflags 2.4.2", + "byteorder", + "bytes", + "crc", + "digest", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf", + "hmac", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand", + "rsa", + "serde", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "uuid", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6ac0ac3b7ccd10cc96c7ab29791a7dd236bd94021f31eec7ba3d46a74aa1c24" +dependencies = [ + "atoi", + "base64", + "bitflags 2.4.2", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "hex", + "hkdf", + "hmac", + "home", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "rand", + "serde", + "serde_json", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "uuid", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "210976b7d948c7ba9fced8ca835b11cbb2d677c59c79de41ac0d397e14547490" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "sqlx-core", + "tracing", + "url", + "urlencoding", + "uuid", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "str_stack" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" + +[[package]] +name = "stringprep" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +dependencies = [ + "finl_unicode", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.48", +] + +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + +[[package]] +name = "symbolic-common" +version = "12.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cccfffbc6bb3bb2d3a26cd2077f4d055f6808d266f9d4d158797a4c60510dfe" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid", +] + +[[package]] +name = "symbolic-demangle" +version = "12.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"76a99812da4020a67e76c4eb41f08c87364c14170495ff780f30dd519c221a68" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "synchronized-writer" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3543ca0810e71767052bdcdd5653f23998b192642a22c5164bfa6581e40a4a2" + +[[package]] +name = "sysinfo" +version = "0.30.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fb4f3438c8f6389c864e61221cbc97e9bca98b4daf39a5beb7bea660f528bb2" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + "windows", +] + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = 
"tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + +[[package]] +name = "tempfile" +version = "3.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a365e8cd18e44762ef95d87f284f4b5cd04107fec2ff3052bd6a3e6069669e67" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "termtree" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" + +[[package]] +name = "test_helpers" +version = "0.1.0" +dependencies = [ + "async-trait", + "dotenvy", + "observability_deps", + "parking_lot", + "tempfile", + "tokio", + "tracing-log", + "tracing-subscriber", + "workspace-hack", +] + +[[package]] +name = "test_helpers_end_to_end" +version = "0.1.0" +dependencies = [ + "arrow", + "arrow-flight", + "arrow_util", + "assert_cmd", + "assert_matches", + "bytes", + "data_types", + "dml", + "futures", + "generated_types", + "http", + "hyper", + "influxdb_iox_client", + "ingester_query_grpc", + "insta", + "iox_catalog", + "iox_query_params", + "mutable_batch_lp", + "mutable_batch_pb", + "nix 0.27.1", + "observability_deps", + "once_cell", + "parking_lot", + "prost", + "rand", + "regex", + "reqwest", + "serde_json", + "snafu 0.8.0", + "sqlx", + "tempfile", + "test_helpers", + "tokio", + "tokio-util", + "tonic", + "workspace-hack", +] + +[[package]] +name = "thiserror" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" 
+dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "thread_local" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding 3.0.4", + "log", + "ordered-float 2.10.1", + "threadpool", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.5.4+5.3.0-patched" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "tracing", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +dependencies = [ + "bytes", + "futures-core", + "futures-io", + "futures-sink", + "pin-project-lite", + "slab", + "tokio", + "tracing", +] + +[[package]] +name = "tokio_metrics_bridge" +version = "0.1.0" +dependencies = [ + "metric", + "parking_lot", + "tokio", + 
"workspace-hack", +] + +[[package]] +name = "tokio_watchdog" +version = "0.1.0" +dependencies = [ + "metric", + "observability_deps", + "test_helpers", + "tokio", + "workspace-hack", +] + +[[package]] +name = "toml" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a9aad4a3066010876e8dcf5a8a06e70a558751117a145c6ce2b82c2e2054290" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c9ffdf896f8daaabf9b66ba8e77ea1ed5ed0f72821b398aba62352e95062951" +dependencies = [ + "indexmap 2.2.2", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + +[[package]] +name = "tonic" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "tokio", + "tokio-rustls", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "tonic-health" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "f80db390246dfb46553481f6024f0082ba00178ea495dbb99e70ba9a4fafb5e1" +dependencies = [ + "async-stream", + "prost", + "tokio", + "tokio-stream", + "tonic", +] + +[[package]] +name = "tonic-reflection" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fa37c513df1339d197f4ba21d28c918b9ef1ac1768265f11ecb6b7f1cba1b76" +dependencies = [ + "prost", + "prost-types", + "tokio", + "tokio-stream", + "tonic", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" +dependencies = [ + "base64", + "bitflags 2.4.2", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "mime", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tower_trailer" +version = "0.1.0" +dependencies = [ + "futures", + "http", + "http-body", + "parking_lot", + "pin-project", + "tower", + "workspace-hack", +] + +[[package]] +name = "trace" +version = "0.1.0" +dependencies = [ + "chrono", + "observability_deps", + 
"parking_lot", + "rand", + "workspace-hack", +] + +[[package]] +name = "trace_exporters" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "clap", + "futures", + "iox_time", + "observability_deps", + "snafu 0.8.0", + "thrift", + "tokio", + "trace", + "workspace-hack", +] + +[[package]] +name = "trace_http" +version = "0.1.0" +dependencies = [ + "bytes", + "futures", + "hashbrown 0.14.3", + "http", + "http-body", + "itertools 0.12.1", + "metric", + "observability_deps", + "parking_lot", + "pin-project", + "snafu 0.8.0", + "tower", + "trace", + "workspace-hack", +] + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "parking_lot", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "tracker" +version = "0.1.0" +dependencies = [ + "futures", + "hashbrown 0.14.3", + "iox_time", + "lock_api", + "metric", + "observability_deps", + "parking_lot", + "pin-project", + "sysinfo", + "tempfile", + "test_helpers", + "tokio", + "tokio-util", + "trace", + "workspace-hack", +] + +[[package]] +name = "treediff" +version = "4.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d127780145176e2b5d16611cc25a900150e86e9fd79d3bde6ff3a37359c9cb5" +dependencies = [ + "serde_json", +] + +[[package]] +name = "triomphe" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3" + +[[package]] +name = "trogging" +version = "0.1.0" +dependencies = [ + "clap", + "logfmt", + "observability_deps", + "regex", + "synchronized-writer", + "thiserror", + "tracing-log", + "tracing-subscriber", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] 
+name = "ucd-trie" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9" + +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + +[[package]] +name = "unicase" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + +[[package]] +name = "unicode-width" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "unsafe-libyaml" +version = "0.2.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab4c90930b95a82d00dc9e9ac071b4991924390d46cbd0dfe566148667605e4b" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "uuid" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" +dependencies = [ + "getrandom", +] + +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +dependencies = [ + "libc", +] + +[[package]] +name = "wal" +version = "0.1.0" +dependencies = [ + "assert_matches", + "byteorder", + "crc32fast", + "data_types", + "dml", + "generated_types", + "hashbrown 0.14.3", + "mutable_batch", + "mutable_batch_lp", + "mutable_batch_pb", + "observability_deps", + "parking_lot", + "prost", + "snafu 0.8.0", + "snap", + "test_helpers", + "tokio", + "workspace-hack", +] + +[[package]] +name = "wal_inspect" +version = "0.1.0" +dependencies = [ + "data_types", + "dml", + "generated_types", + "hashbrown 0.14.3", + "mutable_batch", + "mutable_batch_lp", + "mutable_batch_pb", + "parquet_to_line_protocol", + "schema", + "test_helpers", + "thiserror", + "tokio", + "wal", + "workspace-hack", +] + +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1e124130aee3fb58c5bdd6b639a0509486b0338acaaae0c84a5124b0f588b7f" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9e7e1900c352b609c8488ad12639a311045f40a35491fb69ba8c12f758af70b" +dependencies = [ + "bumpalo", + 
"log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.48", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877b9c3f61ceea0e56331985743b13f3d25c406a7098d45180fb5f09bc19ed97" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b30af9e2d358182b5c7449424f017eba305ed32a7010509ede96cdc4696c46ed" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f186bd2dcf04330886ce82d6f33dd75a7bfcf69ecf5763b89fcde53b6ac9838" + +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96565907687f7aceb35bc5fc03770a8a0471d82e479f25832f54a0e3f4b28446" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "0.25.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" + +[[package]] +name = "which" +version 
= "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "whoami" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] 
+name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "winnow" +version = "0.5.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5389a154b01683d28c77f8f68f49dea75f0a4da32557a58f68ee51ebba472d29" +dependencies = [ + "memchr", +] + +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + +[[package]] +name = "workspace-hack" +version = "0.1.0" +dependencies = [ + "ahash", + "arrow", + "arrow-ipc", + "base64", + "bitflags 2.4.2", + "byteorder", + "bytes", + "cc", + "chrono", + "clap", + "clap_builder", + "crossbeam-epoch", + "crossbeam-utils", + "crypto-common", + "digest", + "either", + "fixedbitset", + "flatbuffers", + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", + "getrandom", + "hashbrown 0.14.3", + "heck", + "hyper", + "hyper-rustls", + "indexmap 2.2.2", + "itertools 0.11.0", + "k8s-openapi", + "kube-core", + "libc", + "lock_api", + "log", + "md-5", + "memchr", + "mio", + "nix 0.27.1", + "nom", + "num-traits", + "object_store", + "once_cell", + "parking_lot", + "percent-encoding", + "petgraph", + "phf_shared", + "proptest", + "prost", + "prost-types", + "rand", + "rand_core", + "regex", + "regex-automata 0.4.5", + "regex-syntax 0.8.2", + "reqwest", + "ring", + "rustls", + "serde", + "serde_json", + "sha2", + "similar", + "spin 0.9.8", + "sqlparser", + "sqlx", + "sqlx-core", + "sqlx-macros", + "sqlx-macros-core", + "sqlx-postgres", + "sqlx-sqlite", 
+ "strum", + "syn 1.0.109", + "syn 2.0.48", + "thrift", + "tokio", + "tokio-stream", + "tokio-util", + "tower", + "tower-http", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "unicode-bidi", + "unicode-normalization", + "url", + "uuid", + "winapi", + "windows-sys 0.48.0", + "windows-sys 0.52.0", +] + +[[package]] +name = "xxhash-rust" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53be06678ed9e83edb1745eb72efc0bbcd7b5c3c35711a860906aed827a13d61" + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + +[[package]] +name = "yaml-rust" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" +dependencies = [ + "linked-hash-map", +] + +[[package]] +name = "yansi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" + +[[package]] +name = "yansi" +version = "1.0.0-rc.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1367295b8f788d371ce2dbc842c7b709c73ee1364d30351dd300ec2203b12377" + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "zeroize" +version = "1.7.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" + +[[package]] +name = "zstd" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.9+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..5aecdd5 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,146 @@ +[workspace] +# In alphabetical order +members = [ + "arrow_util", + "backoff", + "cache_system", + "clap_blocks", + "client_util", + "data_types", + "datafusion_util", + "dml", + "executor", + "flightsql", + "generated_types", + "grpc-binary-logger-proto", + "grpc-binary-logger-test-proto", + "grpc-binary-logger", + "import_export", + "influxdb_influxql_parser", + "influxdb_iox_client", + "influxdb_line_protocol", + "influxdb_storage_client", + "influxdb_tsm", + "influxdb2_client", + "influxrpc_parser", + "iox_catalog", + "iox_data_generator", + "iox_query_influxql", + "iox_query_influxrpc", + "iox_query", + "iox_tests", + "iox_time", + "ioxd_common", + "ioxd_test", + "logfmt", + "metric_exporters", + "metric", + "mutable_batch_lp", + "mutable_batch_pb", + "mutable_batch_tests", + "mutable_batch", + "object_store_metrics", + "observability_deps", + "panic_logging", + "parquet_file", + "parquet_to_line_protocol", + "predicate", + "query_functions", + "schema", + "service_common", + 
"service_grpc_flight", + "service_grpc_testing", + "sharder", + "sqlx-hotswap-pool", + "test_helpers_end_to_end", + "tokio_metrics_bridge", + "trace_exporters", + "trace_http", + "trace", + "tracker", + "trogging", + "wal_inspect", + "wal", + "workspace-hack", +] + +resolver = "2" + +exclude = [ + "*.md", + "*.txt", + ".git*", + ".github/", + "LICENSE*", + "massif.out.*", + "test_bench/", + "test_fixtures/", +] + +[workspace.package] +version = "0.1.0" +authors = ["IOx Project Developers"] +edition = "2021" +license = "MIT OR Apache-2.0" + +[workspace.dependencies] +arrow = { version = "49.0.0", features = ["prettyprint", "chrono-tz"] } +arrow-buffer = { version = "49.0.0" } +arrow-flight = { version = "49.0.0", features = ["flight-sql-experimental"] } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" } +datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev = "0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" } +hashbrown = { version = "0.14.3" } +object_store = { version = "0.8.0" } +parquet = { version = "49.0.0", features = ["object_store"] } +pbjson = { version = "0.6.0" } +pbjson-build = { version = "0.6.2" } +pbjson-types = { version = "0.6.0" } +prost = { version = "0.12.3" } +prost-build = { version = "0.12.2" } +prost-types = { version = "0.12.3" } +sqlparser = { version = "0.41.0" } +tonic = { version = "0.10.2", features = ["tls", "tls-roots"] } +tonic-build = { version = "0.10.2" } +tonic-health = { version = "0.10.2" } +tonic-reflection = { version = "0.10.2" } + +[workspace.lints.rust] +rust_2018_idioms = "deny" +unreachable_pub = "deny" +missing_debug_implementations = "deny" +missing_copy_implementations = "deny" + +[workspace.lints.clippy] +dbg_macro = "deny" +todo = "deny" +clone_on_ref_ptr = "deny" +future_not_send = "deny" + +[workspace.lints.rustdoc] +broken_intra_doc_links = "deny" +bare_urls = "deny" + +# This profile optimizes for runtime performance 
and small binary size at the expense of longer +# build times. It's most suitable for final release builds. +[profile.release] +codegen-units = 16 +debug = true +lto = "thin" + +[profile.bench] +debug = true + +# This profile optimizes for short build times at the expense of larger binary size and slower +# runtime performance. It's most suitable for development iterations. +[profile.quick-release] +inherits = "release" +codegen-units = 16 +lto = false +incremental = true + +# Per insta docs: https://insta.rs/docs/quickstart/#optional-faster-runs +[profile.dev.package.insta] +opt-level = 3 + +[profile.dev.package.similar] +opt-level = 3 diff --git a/arrow_util/Cargo.toml b/arrow_util/Cargo.toml new file mode 100644 index 0000000..18ac4bf --- /dev/null +++ b/arrow_util/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "arrow_util" +description = "Apache Arrow utilities" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } +arrow = { workspace = true } +# used by arrow anyway (needed for printing workaround) +chrono = { version = "0.4", default-features = false } +comfy-table = { version = "7.1", default-features = false } +hashbrown = { workspace = true } +num-traits = "0.2" +once_cell = { version = "1.19", features = ["parking_lot"] } +regex = "1.10.2" +snafu = "0.8" +uuid = "1" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +datafusion = { workspace = true } +proptest = { version = "1.4.0", default-features = false, features = ["std"] } +rand = "0.8.3" diff --git a/arrow_util/src/bitset.rs b/arrow_util/src/bitset.rs new file mode 100644 index 0000000..7fecee6 --- /dev/null +++ b/arrow_util/src/bitset.rs @@ -0,0 +1,879 @@ +use arrow::buffer::{BooleanBuffer, Buffer}; +use std::ops::Range; + +/// An arrow-compatible mutable bitset implementation +/// 
+/// Note: This currently operates on individual bytes at a time +/// it could be optimised to instead operate on usize blocks +#[derive(Debug, Default, Clone)] +pub struct BitSet { + /// The underlying data + /// + /// Data is stored in the least significant bit of a byte first + buffer: Vec, + + /// The length of this mask in bits + len: usize, +} + +impl BitSet { + /// Creates a new BitSet + pub fn new() -> Self { + Self::default() + } + + /// Construct an empty [`BitSet`] with a pre-allocated capacity for `n` + /// bits. + pub fn with_capacity(n: usize) -> Self { + Self { + buffer: Vec::with_capacity((n + 7) / 8), + len: 0, + } + } + + /// Creates a new BitSet with `count` unset bits. + pub fn with_size(count: usize) -> Self { + let mut bitset = Self::default(); + bitset.append_unset(count); + bitset + } + + /// Reserve space for `count` further bits + pub fn reserve(&mut self, count: usize) { + let new_buf_len = (self.len + count + 7) / 8; + self.buffer.reserve(new_buf_len); + } + + /// Appends `count` unset bits + pub fn append_unset(&mut self, count: usize) { + self.len += count; + let new_buf_len = (self.len + 7) / 8; + self.buffer.resize(new_buf_len, 0); + } + + /// Appends `count` set bits + pub fn append_set(&mut self, count: usize) { + let new_len = self.len + count; + let new_buf_len = (new_len + 7) / 8; + + let skew = self.len % 8; + if skew != 0 { + *self.buffer.last_mut().unwrap() |= 0xFF << skew; + } + + self.buffer.resize(new_buf_len, 0xFF); + + let rem = new_len % 8; + if rem != 0 { + *self.buffer.last_mut().unwrap() &= (1 << rem) - 1; + } + + self.len = new_len; + } + + /// Truncates the bitset to the provided length + pub fn truncate(&mut self, len: usize) { + let new_buf_len = (len + 7) / 8; + self.buffer.truncate(new_buf_len); + let overrun = len % 8; + if overrun > 0 { + *self.buffer.last_mut().unwrap() &= (1 << overrun) - 1; + } + self.len = len; + } + + /// Split this bitmap at the specified bit boundary, such that after this + /// call, 
`self` contains the range `[0, n)` and the returned value contains + /// `[n, len)`. + pub fn split_off(&mut self, n: usize) -> Self { + let mut right = Self::with_capacity(self.len - n); + right.extend_from_range(self, n..self.len); + + self.truncate(n); + + right + } + + /// Extends this [`BitSet`] by the context of `other` + pub fn extend_from(&mut self, other: &BitSet) { + self.append_bits(other.len, &other.buffer) + } + + /// Extends this [`BitSet`] by `range` elements in `other` + pub fn extend_from_range(&mut self, other: &BitSet, range: Range) { + let count = range.end - range.start; + if count == 0 { + return; + } + + let start_byte = range.start / 8; + let end_byte = (range.end + 7) / 8; + let skew = range.start % 8; + + // `append_bits` requires the provided `to_set` to be byte aligned, therefore + // if the range being copied is not byte aligned we must first append + // the leading bits to reach a byte boundary + if skew == 0 { + // No skew can simply append bytes directly + self.append_bits(count, &other.buffer[start_byte..end_byte]) + } else if start_byte + 1 == end_byte { + // Append bits from single byte + self.append_bits(count, &[other.buffer[start_byte] >> skew]) + } else { + // Append trailing bits from first byte to reach byte boundary, then append + // bits from the remaining byte-aligned mask + let offset = 8 - skew; + self.append_bits(offset, &[other.buffer[start_byte] >> skew]); + self.append_bits(count - offset, &other.buffer[(start_byte + 1)..end_byte]); + } + } + + /// Appends `count` boolean values from the slice of packed bits + pub fn append_bits(&mut self, count: usize, to_set: &[u8]) { + assert_eq!((count + 7) / 8, to_set.len()); + + let new_len = self.len + count; + let new_buf_len = (new_len + 7) / 8; + self.buffer.reserve(new_buf_len - self.buffer.len()); + + let whole_bytes = count / 8; + let overrun = count % 8; + + let skew = self.len % 8; + if skew == 0 { + self.buffer.extend_from_slice(&to_set[..whole_bytes]); + if overrun 
> 0 { + let masked = to_set[whole_bytes] & ((1 << overrun) - 1); + self.buffer.push(masked) + } + + self.len = new_len; + debug_assert_eq!(self.buffer.len(), new_buf_len); + return; + } + + for to_set_byte in &to_set[..whole_bytes] { + let low = *to_set_byte << skew; + let high = *to_set_byte >> (8 - skew); + + *self.buffer.last_mut().unwrap() |= low; + self.buffer.push(high); + } + + if overrun > 0 { + let masked = to_set[whole_bytes] & ((1 << overrun) - 1); + let low = masked << skew; + *self.buffer.last_mut().unwrap() |= low; + + if overrun > 8 - skew { + let high = masked >> (8 - skew); + self.buffer.push(high) + } + } + + self.len = new_len; + debug_assert_eq!(self.buffer.len(), new_buf_len); + } + + /// Sets a given bit + pub fn set(&mut self, idx: usize) { + assert!(idx <= self.len); + + let byte_idx = idx / 8; + let bit_idx = idx % 8; + self.buffer[byte_idx] |= 1 << bit_idx; + } + + /// Returns if the given index is set + pub fn get(&self, idx: usize) -> bool { + assert!(idx <= self.len); + + let byte_idx = idx / 8; + let bit_idx = idx % 8; + (self.buffer[byte_idx] >> bit_idx) & 1 != 0 + } + + /// Converts this BitSet to a buffer compatible with arrows boolean encoding + pub fn to_arrow(&self) -> BooleanBuffer { + let offset = 0; + BooleanBuffer::new(Buffer::from(&self.buffer), offset, self.len) + } + + /// Returns the number of values stored in the bitset + pub fn len(&self) -> usize { + self.len + } + + /// Returns if this bitset is empty + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the number of bytes used by this bitset + pub fn byte_len(&self) -> usize { + self.buffer.len() + } + + /// Return the raw packed bytes used by this bitset + pub fn bytes(&self) -> &[u8] { + &self.buffer + } + + /// Return `true` if all bits in the [`BitSet`] are currently set. + pub fn is_all_set(&self) -> bool { + // An empty bitmap has no set bits. 
+ if self.len == 0 { + return false; + } + + // Check all the bytes in the bitmap that have all their bits considered + // part of the bit set. + let full_blocks = (self.len / 8).saturating_sub(1); + if !self.buffer.iter().take(full_blocks).all(|&v| v == u8::MAX) { + return false; + } + + // Check the last byte of the bitmap that may only be partially part of + // the bit set, and therefore need masking to check only the relevant + // bits. + let mask = match self.len % 8 { + 1..=8 => !(0xFF << (self.len % 8)), // LSB mask + 0 => 0xFF, + _ => unreachable!(), + }; + *self.buffer.last().unwrap() == mask + } + + /// Return `true` if all bits in the [`BitSet`] are currently unset. + pub fn is_all_unset(&self) -> bool { + self.buffer.iter().all(|&v| v == 0) + } + + /// Returns the number of set bits in this bitmap. + pub fn count_ones(&self) -> usize { + // Invariant: the bits outside of [0, self.len) are always 0 + self.buffer.iter().map(|v| v.count_ones() as usize).sum() + } + + /// Returns the number of unset bits in this bitmap. + pub fn count_zeros(&self) -> usize { + self.len() - self.count_ones() + } + + /// Returns true if any bit is set (short circuiting). + pub fn is_any_set(&self) -> bool { + self.buffer.iter().any(|&v| v != 0) + } + + /// Returns a value [`Iterator`] that yields boolean values encoded in the + /// bitmap. + pub fn iter(&self) -> Iter<'_> { + Iter::new(self) + } + + /// Returns the bitwise AND between the two [`BitSet`] instances. + /// + /// # Panics + /// + /// Panics if the two sets have differing lengths. + pub fn and(&self, other: &Self) -> Self { + assert_eq!(self.len, other.len); + + Self { + buffer: self + .buffer + .iter() + .zip(other.buffer.iter()) + .map(|(a, b)| a & b) + .collect(), + len: self.len, + } + } +} + +/// A value iterator yielding the boolean values encoded in the bitmap. +#[derive(Debug)] +pub struct Iter<'a> { + /// A reference to the bitmap buffer. 
+ buffer: &'a [u8], + /// The index of the next yielded bit in `buffer`. + idx: usize, + /// The number of bits stored in buffer. + len: usize, +} + +impl<'a> Iter<'a> { + fn new(b: &'a BitSet) -> Self { + Self { + buffer: &b.buffer, + idx: 0, + len: b.len(), + } + } +} + +impl<'a> Iterator for Iter<'a> { + type Item = bool; + + fn next(&mut self) -> Option { + if self.idx >= self.len { + return None; + } + + let byte_idx = self.idx / 8; + let shift = self.idx % 8; + + self.idx += 1; + + let byte = self.buffer[byte_idx]; + let byte = byte >> shift; + + Some(byte & 1 == 1) + } + + fn size_hint(&self) -> (usize, Option) { + let v = self.len - self.idx; + (v, Some(v)) + } +} + +impl<'a> ExactSizeIterator for Iter<'a> {} + +/// Returns an iterator over set bit positions in increasing order +pub fn iter_set_positions(bytes: &[u8]) -> impl Iterator + '_ { + iter_set_positions_with_offset(bytes, 0) +} + +/// Returns an iterator over set bit positions in increasing order starting +/// at the provided bit offset +pub fn iter_set_positions_with_offset( + bytes: &[u8], + offset: usize, +) -> impl Iterator + '_ { + let mut byte_idx = offset / 8; + let mut in_progress = bytes.get(byte_idx).cloned().unwrap_or(0); + + let skew = offset % 8; + in_progress &= 0xFF << skew; + + std::iter::from_fn(move || loop { + if in_progress != 0 { + let bit_pos = in_progress.trailing_zeros(); + in_progress ^= 1 << bit_pos; + return Some((byte_idx * 8) + (bit_pos as usize)); + } + byte_idx += 1; + in_progress = *bytes.get(byte_idx)?; + }) +} + +#[cfg(test)] +mod tests { + use arrow::array::BooleanBufferBuilder; + use proptest::prelude::*; + use rand::prelude::*; + use rand::rngs::OsRng; + + use super::*; + + /// Computes a compacted representation of a given bool array + fn compact_bools(bools: &[bool]) -> Vec { + bools + .chunks(8) + .map(|x| { + let mut collect = 0_u8; + for (idx, set) in x.iter().enumerate() { + if *set { + collect |= 1 << idx + } + } + collect + }) + .collect() + } + + fn 
iter_set_bools(bools: &[bool]) -> impl Iterator + '_ { + bools + .iter() + .enumerate() + .filter(|&(_x, y)| *y) + .map(|(x, _y)| x) + } + + #[test] + fn test_compact_bools() { + let bools = &[ + false, false, true, true, false, false, true, false, true, false, + ]; + let collected = compact_bools(bools); + let indexes: Vec<_> = iter_set_bools(bools).collect(); + assert_eq!(collected.as_slice(), &[0b01001100, 0b00000001]); + assert_eq!(indexes.as_slice(), &[2, 3, 6, 8]) + } + + #[test] + fn test_bit_mask() { + let mut mask = BitSet::new(); + + assert!(!mask.is_any_set()); + + mask.append_bits(8, &[0b11111111]); + let d1 = mask.buffer.clone(); + assert!(mask.is_any_set()); + + mask.append_bits(3, &[0b01010010]); + let d2 = mask.buffer.clone(); + + mask.append_bits(5, &[0b00010100]); + let d3 = mask.buffer.clone(); + + mask.append_bits(2, &[0b11110010]); + let d4 = mask.buffer.clone(); + + mask.append_bits(15, &[0b11011010, 0b01010101]); + let d5 = mask.buffer.clone(); + + assert_eq!(d1.as_slice(), &[0b11111111]); + assert_eq!(d2.as_slice(), &[0b11111111, 0b00000010]); + assert_eq!(d3.as_slice(), &[0b11111111, 0b10100010]); + assert_eq!(d4.as_slice(), &[0b11111111, 0b10100010, 0b00000010]); + assert_eq!( + d5.as_slice(), + &[0b11111111, 0b10100010, 0b01101010, 0b01010111, 0b00000001] + ); + + assert!(mask.get(0)); + assert!(!mask.get(8)); + assert!(mask.get(9)); + assert!(mask.get(19)); + } + + fn make_rng() -> StdRng { + let seed = OsRng.next_u64(); + println!("Seed: {seed}"); + StdRng::seed_from_u64(seed) + } + + #[test] + fn test_bit_mask_all_set() { + let mut mask = BitSet::new(); + let mut all_bools = vec![]; + let mut rng = make_rng(); + + for _ in 0..100 { + let mask_length = (rng.next_u32() % 50) as usize; + let bools: Vec<_> = std::iter::repeat(true).take(mask_length).collect(); + + let collected = compact_bools(&bools); + mask.append_bits(mask_length, &collected); + all_bools.extend_from_slice(&bools); + } + + let collected = compact_bools(&all_bools); + 
assert_eq!(mask.buffer, collected); + + let expected_indexes: Vec<_> = iter_set_bools(&all_bools).collect(); + let actual_indexes: Vec<_> = iter_set_positions(&mask.buffer).collect(); + assert_eq!(expected_indexes, actual_indexes); + } + + #[test] + fn test_bit_mask_fuzz() { + let mut mask = BitSet::new(); + let mut all_bools = vec![]; + let mut rng = make_rng(); + + for _ in 0..100 { + let mask_length = (rng.next_u32() % 50) as usize; + let bools: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() & 1 == 0)) + .take(mask_length) + .collect(); + + let collected = compact_bools(&bools); + mask.append_bits(mask_length, &collected); + all_bools.extend_from_slice(&bools); + } + + let collected = compact_bools(&all_bools); + assert_eq!(mask.buffer, collected); + + let expected_indexes: Vec<_> = iter_set_bools(&all_bools).collect(); + let actual_indexes: Vec<_> = iter_set_positions(&mask.buffer).collect(); + assert_eq!(expected_indexes, actual_indexes); + + if !all_bools.is_empty() { + for _ in 0..10 { + let offset = rng.next_u32() as usize % all_bools.len(); + + let expected_indexes: Vec<_> = iter_set_bools(&all_bools[offset..]) + .map(|x| x + offset) + .collect(); + + let actual_indexes: Vec<_> = + iter_set_positions_with_offset(&mask.buffer, offset).collect(); + + assert_eq!(expected_indexes, actual_indexes); + } + } + + for index in actual_indexes { + assert!(mask.get(index)); + } + } + + #[test] + fn test_append_fuzz() { + let mut mask = BitSet::new(); + let mut all_bools = vec![]; + let mut rng = make_rng(); + + for _ in 0..100 { + let len = (rng.next_u32() % 32) as usize; + let set = rng.next_u32() & 1 == 0; + + match set { + true => mask.append_set(len), + false => mask.append_unset(len), + } + + all_bools.extend(std::iter::repeat(set).take(len)); + + let collected = compact_bools(&all_bools); + assert_eq!(mask.buffer, collected); + } + } + + #[test] + fn test_truncate_fuzz() { + let mut mask = BitSet::new(); + let mut all_bools = vec![]; + let mut rng = 
make_rng(); + + for _ in 0..100 { + let mask_length = (rng.next_u32() % 32) as usize; + let bools: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() & 1 == 0)) + .take(mask_length) + .collect(); + + let collected = compact_bools(&bools); + mask.append_bits(mask_length, &collected); + all_bools.extend_from_slice(&bools); + + if !all_bools.is_empty() { + let truncate = rng.next_u32() as usize % all_bools.len(); + mask.truncate(truncate); + all_bools.truncate(truncate); + } + + let collected = compact_bools(&all_bools); + assert_eq!(mask.buffer, collected); + } + } + + #[test] + fn test_extend_range_fuzz() { + let mut rng = make_rng(); + let src_len = 32; + let src_bools: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() & 1 == 0)) + .take(src_len) + .collect(); + + let mut src_mask = BitSet::new(); + src_mask.append_bits(src_len, &compact_bools(&src_bools)); + + let mut dst_bools = Vec::new(); + let mut dst_mask = BitSet::new(); + + for _ in 0..100 { + let a = rng.next_u32() as usize % src_len; + let b = rng.next_u32() as usize % src_len; + + let start = a.min(b); + let end = a.max(b); + + dst_bools.extend_from_slice(&src_bools[start..end]); + dst_mask.extend_from_range(&src_mask, start..end); + + let collected = compact_bools(&dst_bools); + assert_eq!(dst_mask.buffer, collected); + } + } + + #[test] + fn test_arrow_compat() { + let bools = &[ + false, false, true, true, false, false, true, false, true, false, false, true, + ]; + + let mut builder = BooleanBufferBuilder::new(bools.len()); + builder.append_slice(bools); + let buffer = builder.finish(); + + let collected = compact_bools(bools); + let mut mask = BitSet::new(); + mask.append_bits(bools.len(), &collected); + let mask_buffer = mask.to_arrow(); + + assert_eq!(collected.as_slice(), buffer.values()); + assert_eq!(buffer.values(), mask_buffer.into_inner().as_slice()); + } + + #[test] + #[should_panic = "idx <= self.len"] + fn test_bitset_set_get_out_of_bounds() { + let mut v = BitSet::with_size(4); + + // 
The bitset is of length 4, which is backed by a single byte with 8 + // bits of storage capacity. + // + // Accessing bits past the 4 the bitset "contains" should not succeed. + + v.get(5); + v.set(5); + } + + #[test] + fn test_all_set_unset() { + for i in 1..100 { + let mut v = BitSet::new(); + assert!(!v.is_any_set()); + v.append_set(i); + assert!(v.is_all_set()); + assert!(!v.is_all_unset()); + assert!(v.is_any_set()); + + let mut v = BitSet::new(); + v.append_unset(i); + assert!(!v.is_any_set()); + v.append_set(1); + assert!(v.is_any_set()); + } + } + + #[test] + fn test_all_set_unset_multi_byte() { + let mut v = BitSet::new(); + + // Bitmap is composed of entirely set bits. + v.append_set(100); + assert!(v.is_all_set()); + assert!(!v.is_all_unset()); + + // Now the bitmap is neither composed of entirely set, nor entirely + // unset bits. + v.append_unset(1); + assert!(!v.is_all_set()); + assert!(!v.is_all_unset()); + + let mut v = BitSet::new(); + + // Bitmap is composed of entirely unset bits. + v.append_unset(100); + assert!(!v.is_all_set()); + assert!(v.is_all_unset()); + + // And once again, it is neither all set, nor all unset. + v.append_set(1); + assert!(!v.is_all_set()); + assert!(!v.is_all_unset()); + } + + #[test] + fn test_all_set_unset_single_byte() { + let mut v = BitSet::new(); + + // Bitmap is composed of entirely set bits. + v.append_set(2); + assert!(v.is_all_set()); + assert!(!v.is_all_unset()); + + // Now the bitmap is neither composed of entirely set, nor entirely + // unset bits. + v.append_unset(1); + assert!(!v.is_all_set()); + assert!(!v.is_all_unset()); + + let mut v = BitSet::new(); + + // Bitmap is composed of entirely unset bits. + v.append_unset(2); + assert!(!v.is_all_set()); + assert!(v.is_all_unset()); + + // And once again, it is neither all set, nor all unset. 
+ v.append_set(1); + assert!(!v.is_all_set()); + assert!(!v.is_all_unset()); + } + + #[test] + fn test_all_set_unset_empty() { + let v = BitSet::new(); + assert!(!v.is_all_set()); + assert!(v.is_all_unset()); + } + + #[test] + fn test_split_byte_boundary() { + let mut a = BitSet::new(); + + a.append_set(16); + a.append_unset(8); + a.append_set(8); + + let b = a.split_off(16); + + assert_eq!(a.len(), 16); + assert_eq!(b.len(), 16); + + // All the bits in A are set. + assert!(a.is_all_set()); + for i in 0..16 { + assert!(a.get(i)); + } + + // The first 8 bits in b are unset, and the next 8 bits are set. + for i in 0..8 { + assert!(!b.get(i)); + } + for i in 8..16 { + assert!(b.get(i)); + } + } + + #[test] + fn test_split_sub_byte_boundary() { + let mut a = BitSet::new(); + + a.append_set(3); + a.append_unset(3); + a.append_set(1); + + assert_eq!(a.bytes(), &[0b01000111]); + + let b = a.split_off(5); + + assert_eq!(a.len(), 5); + assert_eq!(b.len(), 2); + + // A contains 3 set bits & 2 unset bits, with the rest masked out. + assert_eq!(a.bytes(), &[0b00000111]); + + // B contains 1 unset bit, and then 1 set bit + assert_eq!(b.bytes(), &[0b0000010]); + } + + #[test] + fn test_split_multi_byte_unclean_boundary() { + let mut a = BitSet::new(); + + a.append_set(8); + a.append_unset(1); + a.append_set(1); + a.append_unset(1); + a.append_set(1); + + assert_eq!(a.bytes(), &[0b11111111, 0b00001010]); + + let b = a.split_off(10); + + assert_eq!(a.len(), 10); + assert_eq!(b.len(), 2); + + assert_eq!(a.bytes(), &[0b11111111, 0b00000010]); + assert_eq!(b.bytes(), &[0b0000010]); + } + + #[test] + fn test_count_ones_with_truncate() { + // For varying sizes of bitmaps. + for i in 1..150 { + let mut b = BitSet::new(); + + // Set "i" number of bits in 2*i values. + for _ in 0..i { + b.append_unset(1); + b.append_set(1); + } + + assert_eq!(b.len(), 2 * i); + assert_eq!(b.count_ones(), i); + assert_eq!(b.count_zeros(), i); + + // Split it such that the last bit is removed. 
+ let other = b.split_off((2 * i) - 1); + assert_eq!(other.len(), 1); + assert_eq!(other.count_ones(), 1); + assert_eq!(other.count_zeros(), 0); + + // Which means the original bitmap must now have 1 less 1 bit. + assert_eq!(b.len(), (2 * i) - 1); + assert_eq!(b.count_ones(), i - 1); + assert_eq!(b.count_zeros(), i); + } + } + + prop_compose! { + /// Returns a [`BitSet`] of random length and content. + fn arbitrary_bitset()( + values in prop::collection::vec(any::(), 0..20) + ) -> BitSet { + let mut b = BitSet::new(); + + for v in &values { + match v { + true => b.append_set(1), + false => b.append_unset(1), + } + } + + b + } + } + + proptest! { + #[test] + fn prop_iter( + values in prop::collection::vec(any::(), 0..20), + ) { + let mut b = BitSet::new(); + + for v in &values { + match v { + true => b.append_set(1), + false => b.append_unset(1), + } + } + + assert_eq!(values.len(), b.len()); + + let got = b.iter().collect::>(); + assert_eq!(values, got); + + // Exact size iter + assert_eq!(b.iter().len(), values.len()); + } + + #[test] + fn prop_and( + mut a in arbitrary_bitset(), + mut b in arbitrary_bitset(), + ) { + let min_len = a.len().min(b.len()); + // Truncate a and b to the same length. + a.truncate(min_len); + b.truncate(min_len); + + let want = a + .iter() + .zip(b.iter()) + .map(|(a, b)| a & b) + .collect::>(); + + let c = a.and(&b); + let got = c.iter().collect::>(); + + assert_eq!(got, want); + } + } +} diff --git a/arrow_util/src/dictionary.rs b/arrow_util/src/dictionary.rs new file mode 100644 index 0000000..1885deb --- /dev/null +++ b/arrow_util/src/dictionary.rs @@ -0,0 +1,299 @@ +//! Contains a structure to map from strings to integer symbols based on +//! string interning. 
+use std::convert::TryFrom; + +use arrow::array::{Array, ArrayDataBuilder, DictionaryArray}; +use arrow::buffer::NullBuffer; +use arrow::datatypes::{DataType, Int32Type}; +use hashbrown::HashMap; +use num_traits::{AsPrimitive, FromPrimitive, Zero}; +use snafu::Snafu; + +use crate::string::PackedStringArray; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("duplicate key found {}", key))] + DuplicateKeyFound { key: String }, +} + +/// A String dictionary that builds on top of `PackedStringArray` adding O(1) +/// index lookups for a given string +/// +/// Heavily inspired by the string-interner crate +#[derive(Debug, Clone)] +pub struct StringDictionary { + hash: ahash::RandomState, + /// Used to provide a lookup from string value to key type + /// + /// Note: K's hash implementation is not used, instead the raw entry + /// API is used to store keys w.r.t the hash of the strings themselves + /// + dedup: HashMap, + /// Used to store strings + storage: PackedStringArray, +} + +impl + FromPrimitive + Zero> Default for StringDictionary { + fn default() -> Self { + Self { + hash: ahash::RandomState::new(), + dedup: Default::default(), + storage: PackedStringArray::new(), + } + } +} + +impl + FromPrimitive + Zero> StringDictionary { + pub fn new() -> Self { + Default::default() + } + + pub fn with_capacity(keys: usize, values: usize) -> StringDictionary { + Self { + hash: Default::default(), + dedup: HashMap::with_capacity_and_hasher(keys, ()), + storage: PackedStringArray::with_capacity(keys, values), + } + } + + /// Returns the id corresponding to value, adding an entry for the + /// id if it is not yet present in the dictionary. 
+ pub fn lookup_value_or_insert(&mut self, value: &str) -> K { + use hashbrown::hash_map::RawEntryMut; + + let hasher = &self.hash; + let storage = &mut self.storage; + let hash = hash_str(hasher, value); + + let entry = self + .dedup + .raw_entry_mut() + .from_hash(hash, |key| value == storage.get(key.as_()).unwrap()); + + match entry { + RawEntryMut::Occupied(entry) => *entry.into_key(), + RawEntryMut::Vacant(entry) => { + let index = storage.append(value); + let key = + K::from_usize(index).expect("failed to fit string index into dictionary key"); + *entry + .insert_with_hasher(hash, key, (), |key| { + let string = storage.get(key.as_()).unwrap(); + hash_str(hasher, string) + }) + .0 + } + } + } + + /// Returns the ID in self.dictionary that corresponds to `value`, if any. + pub fn lookup_value(&self, value: &str) -> Option { + let hash = hash_str(&self.hash, value); + self.dedup + .raw_entry() + .from_hash(hash, |key| value == self.storage.get(key.as_()).unwrap()) + .map(|(&symbol, &())| symbol) + } + + /// Returns the str in self.dictionary that corresponds to `id` + pub fn lookup_id(&self, id: K) -> Option<&str> { + self.storage.get(id.as_()) + } + + pub fn size(&self) -> usize { + self.storage.size() + self.dedup.len() * std::mem::size_of::() + } + + pub fn values(&self) -> &PackedStringArray { + &self.storage + } + + pub fn into_inner(self) -> PackedStringArray { + self.storage + } + + /// Truncates this dictionary removing all keys larger than `id` + pub fn truncate(&mut self, id: K) { + let id = id.as_(); + self.dedup.retain(|k, _| k.as_() <= id); + self.storage.truncate(id + 1) + } + + /// Clears this dictionary removing all elements + pub fn clear(&mut self) { + self.storage.clear(); + self.dedup.clear() + } +} + +fn hash_str(hasher: &ahash::RandomState, value: &str) -> u64 { + hasher.hash_one(value) +} + +impl StringDictionary { + /// Convert to an arrow representation with the provided set of + /// keys and an optional null bitmask + pub fn 
to_arrow(&self, keys: I, nulls: Option) -> DictionaryArray + where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, + { + // the nulls are recorded in the keys array, the dictionary itself + // is entirely non null + let dictionary_nulls = None; + let keys = keys.into_iter(); + + let array_data = ArrayDataBuilder::new(DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + )) + .len(keys.len()) + .add_buffer(keys.collect()) + .add_child_data(self.storage.to_arrow(dictionary_nulls).into_data()) + .nulls(nulls) + // TODO consider skipping the validation checks by using + // `build_unchecked()` + .build() + .expect("Valid array data"); + + DictionaryArray::::from(array_data) + } +} + +impl TryFrom> for StringDictionary +where + K: AsPrimitive + FromPrimitive + Zero, +{ + type Error = Error; + + fn try_from(storage: PackedStringArray) -> Result { + use hashbrown::hash_map::RawEntryMut; + + let hasher = ahash::RandomState::new(); + let mut dedup: HashMap = HashMap::with_capacity_and_hasher(storage.len(), ()); + for (idx, value) in storage.iter().enumerate() { + let hash = hash_str(&hasher, value); + + let entry = dedup + .raw_entry_mut() + .from_hash(hash, |key| value == storage.get(key.as_()).unwrap()); + + match entry { + RawEntryMut::Occupied(_) => { + return Err(Error::DuplicateKeyFound { + key: value.to_string(), + }) + } + RawEntryMut::Vacant(entry) => { + let key = + K::from_usize(idx).expect("failed to fit string index into dictionary key"); + + entry.insert_with_hasher(hash, key, (), |key| { + let string = storage.get(key.as_()).unwrap(); + hash_str(&hasher, string) + }); + } + } + } + + Ok(Self { + hash: hasher, + dedup, + storage, + }) + } +} + +#[cfg(test)] +mod test { + use std::convert::TryInto; + + use super::*; + + #[test] + fn test_dictionary() { + let mut dictionary = StringDictionary::::new(); + + let id1 = dictionary.lookup_value_or_insert("cupcake"); + let id2 = dictionary.lookup_value_or_insert("cupcake"); + let id3 = 
dictionary.lookup_value_or_insert("womble"); + + let id4 = dictionary.lookup_value("cupcake").unwrap(); + let id5 = dictionary.lookup_value("womble").unwrap(); + + let cupcake = dictionary.lookup_id(id4).unwrap(); + let womble = dictionary.lookup_id(id5).unwrap(); + + let arrow_expected = arrow::array::StringArray::from(vec!["cupcake", "womble"]); + let arrow_actual = dictionary.values().to_arrow(None); + + assert_eq!(id1, id2); + assert_eq!(id1, id4); + assert_ne!(id1, id3); + assert_eq!(id3, id5); + + assert_eq!(cupcake, "cupcake"); + assert_eq!(womble, "womble"); + + assert!(dictionary.lookup_value("foo").is_none()); + assert!(dictionary.lookup_id(-1).is_none()); + assert_eq!(arrow_expected, arrow_actual); + } + + #[test] + fn from_string_array() { + let mut data = PackedStringArray::::new(); + data.append("cupcakes"); + data.append("foo"); + data.append("bingo"); + + let dictionary: StringDictionary<_> = data.try_into().unwrap(); + + assert_eq!(dictionary.lookup_value("cupcakes"), Some(0)); + assert_eq!(dictionary.lookup_value("foo"), Some(1)); + assert_eq!(dictionary.lookup_value("bingo"), Some(2)); + + assert_eq!(dictionary.lookup_id(0), Some("cupcakes")); + assert_eq!(dictionary.lookup_id(1), Some("foo")); + assert_eq!(dictionary.lookup_id(2), Some("bingo")); + } + + #[test] + fn from_string_array_duplicates() { + let mut data = PackedStringArray::::new(); + data.append("cupcakes"); + data.append("foo"); + data.append("bingo"); + data.append("cupcakes"); + + let err = TryInto::>::try_into(data).expect_err("expected failure"); + assert!(matches!(err, Error::DuplicateKeyFound { key } if &key == "cupcakes")) + } + + #[test] + fn test_truncate() { + let mut dictionary = StringDictionary::::new(); + dictionary.lookup_value_or_insert("cupcake"); + dictionary.lookup_value_or_insert("cupcake"); + dictionary.lookup_value_or_insert("bingo"); + let bingo = dictionary.lookup_value_or_insert("bingo"); + let bongo = dictionary.lookup_value_or_insert("bongo"); + 
dictionary.lookup_value_or_insert("bingo"); + dictionary.lookup_value_or_insert("cupcake"); + + dictionary.truncate(bingo); + + assert_eq!(dictionary.values().len(), 2); + assert_eq!(dictionary.dedup.len(), 2); + + assert_eq!(dictionary.lookup_value("cupcake"), Some(0)); + assert_eq!(dictionary.lookup_value("bingo"), Some(1)); + + assert!(dictionary.lookup_value("bongo").is_none()); + assert!(dictionary.lookup_id(bongo).is_none()); + + dictionary.lookup_value_or_insert("bongo"); + assert_eq!(dictionary.lookup_value("bongo"), Some(2)); + } +} diff --git a/arrow_util/src/display.rs b/arrow_util/src/display.rs new file mode 100644 index 0000000..cba4b91 --- /dev/null +++ b/arrow_util/src/display.rs @@ -0,0 +1,206 @@ +use arrow::array::{ArrayRef, DurationNanosecondArray, TimestampNanosecondArray}; +use arrow::datatypes::{DataType, TimeUnit}; +use arrow::error::{ArrowError, Result}; +use arrow::record_batch::RecordBatch; + +use comfy_table::{Cell, Table}; + +use chrono::prelude::*; + +/// custom version of +/// [pretty_format_batches](arrow::util::pretty::pretty_format_batches) +/// that displays timestamps using RFC3339 format (e.g. 
`2021-07-20T23:28:50Z`) +/// +/// Should be removed if/when the capability is added upstream to arrow: +/// +pub fn pretty_format_batches(results: &[RecordBatch]) -> Result { + Ok(create_table(results)?.to_string()) +} + +/// Convert the value at `column[row]` to a String +/// +/// Special cases printing Timestamps in RFC3339 for IOx, otherwise +/// falls back to Arrow's implementation +/// +fn array_value_to_string(column: &ArrayRef, row: usize) -> Result { + match column.data_type() { + DataType::Timestamp(TimeUnit::Nanosecond, None) if column.is_valid(row) => { + let ts_column = column + .as_any() + .downcast_ref::() + .unwrap(); + + let ts_value = ts_column.value(row); + const NANOS_IN_SEC: i64 = 1_000_000_000; + let secs = ts_value / NANOS_IN_SEC; + let nanos = (ts_value - (secs * NANOS_IN_SEC)) as u32; + let ts = NaiveDateTime::from_timestamp_opt(secs, nanos).ok_or_else(|| { + ArrowError::ExternalError( + format!("Cannot process timestamp (secs={secs}, nanos={nanos})").into(), + ) + })?; + // treat as UTC + let ts = DateTime::::from_naive_utc_and_offset(ts, Utc); + // convert to string in preferred influx format + let use_z = true; + Ok(ts.to_rfc3339_opts(SecondsFormat::AutoSi, use_z)) + } + // TODO(edd): see https://github.com/apache/arrow-rs/issues/1168 + DataType::Duration(TimeUnit::Nanosecond) if column.is_valid(row) => { + let dur_column = column + .as_any() + .downcast_ref::() + .unwrap(); + + let duration = std::time::Duration::from_nanos( + dur_column + .value(row) + .try_into() + .map_err(|e| ArrowError::InvalidArgumentError(format!("{e:?}")))?, + ); + Ok(format!("{duration:?}")) + } + _ => { + // fallback to arrow's default printing for other types + arrow::util::display::array_value_to_string(column, row) + } + } +} + +/// Convert a series of record batches into a table +/// +/// NB: COPIED FROM ARROW +fn create_table(results: &[RecordBatch]) -> Result { + let mut table = Table::new(); + table.load_preset("||--+-++| ++++++"); + + if 
results.is_empty() { + return Ok(table); + } + + let schema = results[0].schema(); + + let mut header = Vec::new(); + for field in schema.fields() { + header.push(Cell::new(field.name())); + } + table.set_header(header); + + for (i, batch) in results.iter().enumerate() { + if batch.schema() != schema { + return Err(ArrowError::SchemaError(format!( + "Batches have different schemas:\n\nFirst:\n{}\n\nBatch {}:\n{}", + schema, + i + 1, + batch.schema() + ))); + } + + for row in 0..batch.num_rows() { + let mut cells = Vec::new(); + for col in 0..batch.num_columns() { + let column = batch.column(col); + cells.push(Cell::new(array_value_to_string(column, row)?)); + } + table.add_row(cells); + } + } + + Ok(table) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Arc; + + use arrow::{ + array::{ + ArrayRef, BooleanArray, DictionaryArray, Float64Array, Int64Array, StringArray, + UInt64Array, + }, + datatypes::Int32Type, + }; + use datafusion::common::assert_contains; + + #[test] + fn test_formatting() { + // tests formatting all of the Arrow array types used in IOx + + // tags use string dictionary + let dict_array: ArrayRef = Arc::new( + vec![Some("a"), None, Some("b")] + .into_iter() + .collect::>(), + ); + + // field types + let int64_array: ArrayRef = + Arc::new([Some(-1), None, Some(2)].iter().collect::()); + let uint64_array: ArrayRef = + Arc::new([Some(1), None, Some(2)].iter().collect::()); + let float64_array: ArrayRef = Arc::new( + [Some(1.0), None, Some(2.0)] + .iter() + .collect::(), + ); + let bool_array: ArrayRef = Arc::new( + [Some(true), None, Some(false)] + .iter() + .collect::(), + ); + let string_array: ArrayRef = Arc::new( + vec![Some("foo"), None, Some("bar")] + .into_iter() + .collect::(), + ); + + // timestamp type + let ts_array: ArrayRef = Arc::new( + [None, Some(100), Some(1626823730000000000)] + .iter() + .collect::(), + ); + + let batch = RecordBatch::try_from_iter(vec![ + ("dict", dict_array), + ("int64", int64_array), + 
("uint64", uint64_array), + ("float64", float64_array), + ("bool", bool_array), + ("string", string_array), + ("time", ts_array), + ]) + .unwrap(); + + let table = pretty_format_batches(&[batch]).unwrap(); + + let expected = vec![ + "+------+-------+--------+---------+-------+--------+--------------------------------+", + "| dict | int64 | uint64 | float64 | bool | string | time |", + "+------+-------+--------+---------+-------+--------+--------------------------------+", + "| a | -1 | 1 | 1.0 | true | foo | |", + "| | | | | | | 1970-01-01T00:00:00.000000100Z |", + "| b | 2 | 2 | 2.0 | false | bar | 2021-07-20T23:28:50Z |", + "+------+-------+--------+---------+-------+--------+--------------------------------+", + ]; + + let actual: Vec<&str> = table.lines().collect(); + assert_eq!( + expected, actual, + "Expected:\n\n{expected:#?}\nActual:\n\n{actual:#?}\n" + ); + } + + #[test] + fn test_pretty_format_batches_checks_schemas() { + let int64_array: ArrayRef = Arc::new([Some(2)].iter().collect::()); + let uint64_array: ArrayRef = Arc::new([Some(2)].iter().collect::()); + + let batch1 = RecordBatch::try_from_iter(vec![("col", int64_array)]).unwrap(); + let batch2 = RecordBatch::try_from_iter(vec![("col", uint64_array)]).unwrap(); + + let err = pretty_format_batches(&[batch1, batch2]).unwrap_err(); + assert_contains!(err.to_string(), "Batches have different schemas:"); + } +} diff --git a/arrow_util/src/flight.rs b/arrow_util/src/flight.rs new file mode 100644 index 0000000..66521aa --- /dev/null +++ b/arrow_util/src/flight.rs @@ -0,0 +1,26 @@ +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; + +/// Prepare an arrow Schema for transport over the Arrow Flight protocol +/// +/// Converts dictionary types to underlying types due to +pub fn prepare_schema_for_flight(schema: SchemaRef) -> SchemaRef { + let fields: Fields = schema + .fields() + .iter() + .map(|field| match field.data_type() { + DataType::Dictionary(_, value_type) => 
Arc::new( + Field::new( + field.name(), + value_type.as_ref().clone(), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ), + _ => Arc::clone(field), + }) + .collect(); + + Arc::new(Schema::new(fields).with_metadata(schema.metadata().clone())) +} diff --git a/arrow_util/src/lib.rs b/arrow_util/src/lib.rs new file mode 100644 index 0000000..613d794 --- /dev/null +++ b/arrow_util/src/lib.rs @@ -0,0 +1,27 @@ +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![allow(clippy::clone_on_ref_ptr)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +pub mod bitset; +pub mod dictionary; +pub mod display; +pub mod flight; +pub mod optimize; +pub mod string; +pub mod util; + +/// This has a collection of testing helper functions +pub mod test_util; diff --git a/arrow_util/src/optimize.rs b/arrow_util/src/optimize.rs new file mode 100644 index 0000000..2f7ffbf --- /dev/null +++ b/arrow_util/src/optimize.rs @@ -0,0 +1,299 @@ +use std::collections::BTreeSet; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, DictionaryArray, StringArray}; +use arrow::datatypes::{DataType, Int32Type}; +use arrow::error::{ArrowError, Result}; +use arrow::record_batch::RecordBatch; +use hashbrown::HashMap; + +use crate::dictionary::StringDictionary; + +/// Takes a record batch and returns a new record batch with dictionaries +/// optimized to contain no duplicate or unreferenced values +/// +/// Where the input dictionaries are sorted, the output dictionaries +/// will also be +pub fn optimize_dictionaries(batch: &RecordBatch) -> Result { + let schema = batch.schema(); + let new_columns = batch + .columns() + .iter() 
+ .zip(schema.fields()) + .map(|(col, field)| match field.data_type() { + DataType::Dictionary(key, value) => optimize_dict_col(col, key, value), + _ => Ok(Arc::clone(col)), + }) + .collect::>>()?; + + RecordBatch::try_new(schema, new_columns) +} + +/// Optimizes the dictionaries for a column +fn optimize_dict_col( + col: &ArrayRef, + key_type: &DataType, + value_type: &DataType, +) -> Result { + if key_type != &DataType::Int32 { + return Err(ArrowError::NotYetImplemented(format!( + "truncating non-Int32 dictionaries not supported: {key_type}" + ))); + } + + if value_type != &DataType::Utf8 { + return Err(ArrowError::NotYetImplemented(format!( + "truncating non-string dictionaries not supported: {value_type}" + ))); + } + + let col = col + .as_any() + .downcast_ref::>() + .expect("unexpected datatype"); + + let keys = col.keys(); + let values = col.values(); + let values = values + .as_any() + .downcast_ref::() + .expect("unexpected datatype"); + + // The total length of the resulting values array + let mut values_len = 0_usize; + + // Keys that appear in the values array + // Use a BTreeSet to preserve the order of the dictionary + let mut used_keys = BTreeSet::new(); + for key in keys.iter().flatten() { + if used_keys.insert(key) { + values_len += values.value_length(key as usize) as usize; + } + } + + // Then perform deduplication + let mut new_dictionary = StringDictionary::with_capacity(used_keys.len(), values_len); + let mut old_to_new_idx: HashMap = HashMap::with_capacity(used_keys.len()); + for key in used_keys { + let new_key = new_dictionary.lookup_value_or_insert(values.value(key as usize)); + old_to_new_idx.insert(key, new_key); + } + + let new_keys = keys.iter().map(|x| match x { + Some(x) => *old_to_new_idx.get(&x).expect("no mapping found"), + None => -1, + }); + + let nulls = keys.nulls().cloned(); + Ok(Arc::new(new_dictionary.to_arrow(new_keys, nulls))) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate as arrow_util; + use 
crate::assert_batches_eq; + use arrow::array::{ArrayDataBuilder, DictionaryArray, Float64Array, Int32Array, StringArray}; + use arrow::compute::concat; + use std::iter::FromIterator; + + #[test] + fn test_optimize_dictionaries() { + let values = StringArray::from(vec![ + "duplicate", + "duplicate", + "foo", + "boo", + "unused", + "duplicate", + ]); + let keys = Int32Array::from(vec![ + Some(0), + Some(1), + None, + Some(1), + Some(2), + Some(5), + Some(3), + ]); + + let batch = RecordBatch::try_from_iter(vec![( + "foo", + Arc::new(build_dict(keys, values)) as ArrayRef, + )]) + .unwrap(); + + let optimized = optimize_dictionaries(&batch).unwrap(); + + let col = optimized + .column(0) + .as_any() + .downcast_ref::>() + .unwrap(); + + let values = col.values(); + let values = values.as_any().downcast_ref::().unwrap(); + let values = values.iter().flatten().collect::>(); + assert_eq!(values, vec!["duplicate", "foo", "boo"]); + + assert_batches_eq!( + vec![ + "+-----------+", + "| foo |", + "+-----------+", + "| duplicate |", + "| duplicate |", + "| |", + "| duplicate |", + "| foo |", + "| duplicate |", + "| boo |", + "+-----------+", + ], + &[optimized] + ); + } + + #[test] + fn test_optimize_dictionaries_concat() { + let f1_1 = Float64Array::from(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0)]); + let t2_1 = DictionaryArray::::from_iter(vec![ + Some("a"), + Some("g"), + Some("a"), + Some("b"), + ]); + let t1_1 = DictionaryArray::::from_iter(vec![ + Some("a"), + Some("a"), + Some("b"), + Some("b"), + ]); + + let f1_2 = Float64Array::from(vec![Some(1.0), Some(5.0), Some(2.0), Some(46.0)]); + let t2_2 = DictionaryArray::::from_iter(vec![ + Some("a"), + Some("b"), + Some("a"), + Some("a"), + ]); + let t1_2 = DictionaryArray::::from_iter(vec![ + Some("a"), + Some("d"), + Some("a"), + Some("b"), + ]); + + let concat = RecordBatch::try_from_iter(vec![ + ("f1", concat(&[&f1_1, &f1_2]).unwrap()), + ("t2", concat(&[&t2_1, &t2_2]).unwrap()), + ("t1", concat(&[&t1_1, 
&t1_2]).unwrap()), + ]) + .unwrap(); + + let optimized = optimize_dictionaries(&concat).unwrap(); + + let col = optimized + .column(optimized.schema().column_with_name("t2").unwrap().0) + .as_any() + .downcast_ref::>() + .unwrap(); + + let values = col.values(); + let values = values.as_any().downcast_ref::().unwrap(); + let values = values.iter().flatten().collect::>(); + assert_eq!(values, vec!["a", "g", "b"]); + + let col = optimized + .column(optimized.schema().column_with_name("t1").unwrap().0) + .as_any() + .downcast_ref::>() + .unwrap(); + + let values = col.values(); + let values = values.as_any().downcast_ref::().unwrap(); + let values = values.iter().flatten().collect::>(); + assert_eq!(values, vec!["a", "b", "d"]); + + assert_batches_eq!( + vec![ + "+------+----+----+", + "| f1 | t2 | t1 |", + "+------+----+----+", + "| 1.0 | a | a |", + "| 2.0 | g | a |", + "| 3.0 | a | b |", + "| 4.0 | b | b |", + "| 1.0 | a | a |", + "| 5.0 | b | d |", + "| 2.0 | a | a |", + "| 46.0 | a | b |", + "+------+----+----+", + ], + &[optimized] + ); + } + + #[test] + fn test_optimize_dictionaries_null() { + let values = StringArray::from(vec!["bananas"]); + let keys = Int32Array::from(vec![None, None, Some(0)]); + let col = Arc::new(build_dict(keys, values)) as ArrayRef; + + let col = optimize_dict_col(&col, &DataType::Int32, &DataType::Utf8).unwrap(); + + let batch = RecordBatch::try_from_iter(vec![("t", col)]).unwrap(); + + assert_batches_eq!( + vec![ + "+---------+", + "| t |", + "+---------+", + "| |", + "| |", + "| bananas |", + "+---------+", + ], + &[batch] + ); + } + + #[test] + fn test_optimize_dictionaries_slice() { + let values = StringArray::from(vec!["bananas"]); + let keys = Int32Array::from(vec![None, Some(0), None]); + let col = Arc::new(build_dict(keys, values)) as ArrayRef; + let col = col.slice(1, 2); + + let col = optimize_dict_col(&col, &DataType::Int32, &DataType::Utf8).unwrap(); + + let batch = RecordBatch::try_from_iter(vec![("t", col)]).unwrap(); + + 
assert_batches_eq!( + vec![ + "+---------+", + "| t |", + "+---------+", + "| bananas |", + "| |", + "+---------+", + ], + &[batch] + ); + } + + fn build_dict(keys: Int32Array, values: StringArray) -> DictionaryArray { + let data = ArrayDataBuilder::new(DataType::Dictionary( + Box::new(DataType::Int32), + Box::new(DataType::Utf8), + )) + .len(keys.len()) + .add_buffer(keys.to_data().buffers()[0].clone()) + .nulls(keys.nulls().cloned()) + .add_child_data(values.into_data()) + .build() + .unwrap(); + + DictionaryArray::from(data) + } +} diff --git a/arrow_util/src/string.rs b/arrow_util/src/string.rs new file mode 100644 index 0000000..5460a38 --- /dev/null +++ b/arrow_util/src/string.rs @@ -0,0 +1,384 @@ +use arrow::array::ArrayDataBuilder; +use arrow::array::StringArray; +use arrow::buffer::Buffer; +use arrow::buffer::NullBuffer; +use num_traits::{AsPrimitive, FromPrimitive, Zero}; +use std::fmt::Debug; +use std::ops::Range; + +/// A packed string array that stores start and end indexes into +/// a contiguous string slice. 
+/// +/// The type parameter K alters the type used to store the offsets +#[derive(Debug, Clone)] +pub struct PackedStringArray { + /// The start and end offsets of strings stored in storage + offsets: Vec, + /// A contiguous array of string data + storage: String, +} + +impl Default for PackedStringArray { + fn default() -> Self { + Self { + offsets: vec![K::zero()], + storage: String::new(), + } + } +} + +impl + FromPrimitive + Zero> PackedStringArray { + pub fn new() -> Self { + Self::default() + } + + pub fn new_empty(len: usize) -> Self { + Self { + offsets: vec![K::zero(); len + 1], + storage: String::new(), + } + } + + pub fn with_capacity(keys: usize, values: usize) -> Self { + let mut offsets = Vec::with_capacity(keys + 1); + offsets.push(K::zero()); + + Self { + offsets, + storage: String::with_capacity(values), + } + } + + /// Append a value + /// + /// Returns the index of the appended data + pub fn append(&mut self, data: &str) -> usize { + let id = self.offsets.len() - 1; + + let offset = self.storage.len() + data.len(); + let offset = K::from_usize(offset).expect("failed to fit into offset type"); + + self.offsets.push(offset); + self.storage.push_str(data); + + id + } + + /// Extends this [`PackedStringArray`] by the contents of `other` + pub fn extend_from(&mut self, other: &PackedStringArray) { + let offset = self.storage.len(); + self.storage.push_str(other.storage.as_str()); + // Copy offsets skipping the first element as this string start delimiter is already + // provided by the end delimiter of the current offsets array + self.offsets.extend( + other + .offsets + .iter() + .skip(1) + .map(|x| K::from_usize(x.as_() + offset).expect("failed to fit into offset type")), + ) + } + + /// Extends this [`PackedStringArray`] by `range` elements from `other` + pub fn extend_from_range(&mut self, other: &PackedStringArray, range: Range) { + let first_offset: usize = other.offsets[range.start].as_(); + let end_offset: usize = 
other.offsets[range.end].as_(); + + let insert_offset = self.storage.len(); + + self.storage + .push_str(&other.storage[first_offset..end_offset]); + + self.offsets.extend( + other.offsets[(range.start + 1)..(range.end + 1)] + .iter() + .map(|x| { + K::from_usize(x.as_() - first_offset + insert_offset) + .expect("failed to fit into offset type") + }), + ) + } + + /// Get the value at a given index + pub fn get(&self, index: usize) -> Option<&str> { + let start_offset = self.offsets.get(index)?.as_(); + let end_offset = self.offsets.get(index + 1)?.as_(); + + Some(&self.storage[start_offset..end_offset]) + } + + /// Pads with empty strings to reach length + pub fn extend(&mut self, len: usize) { + let offset = K::from_usize(self.storage.len()).expect("failed to fit into offset type"); + self.offsets.resize(self.offsets.len() + len, offset); + } + + /// Truncates the array to the given length + pub fn truncate(&mut self, len: usize) { + self.offsets.truncate(len + 1); + let last_idx = self.offsets.last().expect("offsets empty"); + self.storage.truncate(last_idx.as_()); + } + + /// Removes all elements from this array + pub fn clear(&mut self) { + self.offsets.truncate(1); + self.storage.clear(); + } + + pub fn iter(&self) -> PackedStringIterator<'_, K> { + PackedStringIterator { + array: self, + index: 0, + } + } + + /// The number of strings in this array + pub fn len(&self) -> usize { + self.offsets.len() - 1 + } + + pub fn is_empty(&self) -> bool { + self.offsets.len() == 1 + } + + /// Return the amount of memory in bytes taken up by this array + pub fn size(&self) -> usize { + self.storage.capacity() + self.offsets.capacity() * std::mem::size_of::() + } + + pub fn inner(&self) -> (&[K], &str) { + (&self.offsets, &self.storage) + } + + pub fn into_inner(self) -> (Vec, String) { + (self.offsets, self.storage) + } + + /// Split this [`PackedStringArray`] at `n`, such that `self`` contains the + /// elements `[0, n)` and the returned [`PackedStringArray`] contains + 
/// elements `[n, len)`. + pub fn split_off(&mut self, n: usize) -> Self { + if n > self.len() { + return Default::default(); + } + + let offsets = self.offsets.split_off(n + 1); + + // Figure out where to split the string storage. + let split_point = self.offsets.last().map(|v| v.as_()).unwrap(); + + // Split the storage at the split point, such that the first N values + // appear in self. + let storage = self.storage.split_off(split_point); + + // The new "offsets" now needs remapping such that the first offset + // starts at 0, so that indexing into the new storage string will hit + // the right start point. + let offsets = std::iter::once(K::zero()) + .chain( + offsets + .into_iter() + .map(|v| K::from_usize(v.as_() - split_point).unwrap()), + ) + .collect::>(); + + Self { offsets, storage } + } +} + +impl PackedStringArray { + /// Convert to an arrow with an optional null bitmask + pub fn to_arrow(&self, nulls: Option) -> StringArray { + let len = self.offsets.len() - 1; + let offsets = Buffer::from_slice_ref(&self.offsets); + let values = Buffer::from(self.storage.as_bytes()); + + let data = ArrayDataBuilder::new(arrow::datatypes::DataType::Utf8) + .len(len) + .add_buffer(offsets) + .add_buffer(values) + .nulls(nulls) + .build() + // TODO consider skipping the validation checks by using + // `new_unchecked` + .expect("Valid array data"); + StringArray::from(data) + } +} + +#[derive(Debug)] +pub struct PackedStringIterator<'a, K> { + array: &'a PackedStringArray, + index: usize, +} + +impl<'a, K: AsPrimitive + FromPrimitive + Zero> Iterator for PackedStringIterator<'a, K> { + type Item = &'a str; + + fn next(&mut self) -> Option { + let item = self.array.get(self.index)?; + self.index += 1; + Some(item) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.array.len() - self.index; + (len, Some(len)) + } +} + +#[cfg(test)] +mod tests { + use crate::string::PackedStringArray; + + use proptest::prelude::*; + + #[test] + fn test_storage() { + let mut 
array = PackedStringArray::::new(); + + array.append("hello"); + array.append("world"); + array.append("cupcake"); + + assert_eq!(array.get(0).unwrap(), "hello"); + assert_eq!(array.get(1).unwrap(), "world"); + assert_eq!(array.get(2).unwrap(), "cupcake"); + assert!(array.get(-1_i32 as usize).is_none()); + + assert!(array.get(3).is_none()); + + array.extend(2); + assert_eq!(array.get(3).unwrap(), ""); + assert_eq!(array.get(4).unwrap(), ""); + assert!(array.get(5).is_none()); + } + + #[test] + fn test_empty() { + let array = PackedStringArray::::new_empty(20); + assert_eq!(array.get(12).unwrap(), ""); + assert_eq!(array.get(9).unwrap(), ""); + assert_eq!(array.get(3).unwrap(), ""); + } + + #[test] + fn test_truncate() { + let mut array = PackedStringArray::::new(); + + array.append("hello"); + array.append("world"); + array.append("cupcake"); + + array.truncate(1); + assert_eq!(array.len(), 1); + assert_eq!(array.get(0).unwrap(), "hello"); + + array.append("world"); + assert_eq!(array.len(), 2); + assert_eq!(array.get(0).unwrap(), "hello"); + assert_eq!(array.get(1).unwrap(), "world"); + } + + #[test] + fn test_extend_from() { + let mut a = PackedStringArray::::new(); + + a.append("hello"); + a.append("world"); + a.append("cupcake"); + a.append(""); + + let mut b = PackedStringArray::::new(); + + b.append("foo"); + b.append("bar"); + + a.extend_from(&b); + + let a_content: Vec<_> = a.iter().collect(); + assert_eq!( + a_content, + vec!["hello", "world", "cupcake", "", "foo", "bar"] + ); + } + + #[test] + fn test_extend_from_range() { + let mut a = PackedStringArray::::new(); + + a.append("hello"); + a.append("world"); + a.append("cupcake"); + a.append(""); + + let mut b = PackedStringArray::::new(); + + b.append("foo"); + b.append("bar"); + b.append(""); + b.append("fiz"); + + a.extend_from_range(&b, 1..3); + + assert_eq!(a.len(), 6); + + let a_content: Vec<_> = a.iter().collect(); + assert_eq!(a_content, vec!["hello", "world", "cupcake", "", "bar", ""]); + + // 
Should be a no-op + a.extend_from_range(&b, 0..0); + + let a_content: Vec<_> = a.iter().collect(); + assert_eq!(a_content, vec!["hello", "world", "cupcake", "", "bar", ""]); + + a.extend_from_range(&b, 0..1); + + let a_content: Vec<_> = a.iter().collect(); + assert_eq!( + a_content, + vec!["hello", "world", "cupcake", "", "bar", "", "foo"] + ); + + a.extend_from_range(&b, 1..4); + + let a_content: Vec<_> = a.iter().collect(); + assert_eq!( + a_content, + vec!["hello", "world", "cupcake", "", "bar", "", "foo", "bar", "", "fiz"] + ); + } + + proptest! { + #[test] + fn prop_split_off( + a in prop::collection::vec(any::(), 0..20), + b in prop::collection::vec(any::(), 0..20), + ) { + let mut p = PackedStringArray::::new(); + + // Add all the elements in "a" and "b" to the string array. + for v in a.iter().chain(b.iter()) { + p.append(v); + } + + // Split the packed string array at the boundary of "a". + let p2 = p.split_off(a.len()); + + assert_eq!(p.iter().collect::>(), a, "parent"); + assert_eq!(p2.iter().collect::>(), b, "child"); + } + } + + #[test] + fn test_split_off_oob() { + let mut p = PackedStringArray::::new(); + + p.append("bananas"); + + let got = p.split_off(42); + assert_eq!(p.len(), 1); + assert_eq!(got.len(), 0); + } +} diff --git a/arrow_util/src/test_util.rs b/arrow_util/src/test_util.rs new file mode 100644 index 0000000..8126e25 --- /dev/null +++ b/arrow_util/src/test_util.rs @@ -0,0 +1,419 @@ +//! A collection of testing functions for arrow based code +use std::sync::Arc; + +use crate::display::pretty_format_batches; +use arrow::{ + array::{new_null_array, ArrayRef, StringArray}, + compute::kernels::sort::{lexsort, SortColumn, SortOptions}, + datatypes::Schema, + error::ArrowError, + record_batch::RecordBatch, +}; +use once_cell::sync::Lazy; +use regex::{Captures, Regex}; +use std::{borrow::Cow, collections::HashMap}; +use uuid::Uuid; + +/// Compares the formatted output with the pretty formatted results of +/// record batches. 
This is a macro so errors appear on the correct line +/// +/// Designed so that failure output can be directly copy/pasted +/// into the test code as expected results. +/// +/// Expects to be called about like this: +/// assert_batches_eq(expected_lines: &[&str], chunks: &[RecordBatch]) +#[macro_export] +macro_rules! assert_batches_eq { + ($EXPECTED_LINES: expr, $CHUNKS: expr) => { + let expected_lines: Vec = + $EXPECTED_LINES.into_iter().map(|s| s.to_string()).collect(); + + let actual_lines = arrow_util::test_util::batches_to_lines($CHUNKS); + + assert_eq!( + expected_lines, actual_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + }; +} + +/// Compares formatted output of a record batch with an expected +/// vector of strings in a way that order does not matter. +/// This is a macro so errors appear on the correct line +/// +/// Designed so that failure output can be directly copy/pasted +/// into the test code as expected results. +/// +/// Expects to be called about like this: +/// +/// `assert_batch_sorted_eq!(expected_lines: &[&str], batches: &[RecordBatch])` +#[macro_export] +macro_rules! 
assert_batches_sorted_eq { + ($EXPECTED_LINES: expr, $CHUNKS: expr) => { + let expected_lines: Vec = $EXPECTED_LINES.iter().map(|&s| s.into()).collect(); + let expected_lines = arrow_util::test_util::sort_lines(expected_lines); + + let actual_lines = arrow_util::test_util::batches_to_sorted_lines($CHUNKS); + + assert_eq!( + expected_lines, actual_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + }; +} + +/// Converts the [`RecordBatch`]es into a pretty printed output suitable for +/// comparing in tests +/// +/// Example: +/// +/// ```text +/// "+-----+------+------+--------------------------------+", +/// "| foo | host | load | time |", +/// "+-----+------+------+--------------------------------+", +/// "| | a | 1.0 | 1970-01-01T00:00:00.000000011Z |", +/// "| | a | 14.0 | 1970-01-01T00:00:00.000010001Z |", +/// "| | a | 3.0 | 1970-01-01T00:00:00.000000033Z |", +/// "| | b | 5.0 | 1970-01-01T00:00:00.000000011Z |", +/// "| | z | 0.0 | 1970-01-01T00:00:00Z |", +/// "+-----+------+------+--------------------------------+", +/// ``` +pub fn batches_to_lines(batches: &[RecordBatch]) -> Vec { + crate::display::pretty_format_batches(batches) + .unwrap() + .trim() + .lines() + .map(|s| s.to_string()) + .collect() +} + +/// Converts the [`RecordBatch`]es into a pretty printed output suitable for +/// comparing in tests where sorting does not matter. 
+pub fn batches_to_sorted_lines(batches: &[RecordBatch]) -> Vec { + sort_lines(batches_to_lines(batches)) +} + +/// Sorts the lines (assumed to be the output of `batches_to_lines` for stable comparison) +pub fn sort_lines(mut lines: Vec) -> Vec { + // sort except for header + footer + let num_lines = lines.len(); + if num_lines > 3 { + lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + lines +} + +// sort a record batch by all columns (to provide a stable output order for test +// comparison) +pub fn sort_record_batch(batch: RecordBatch) -> RecordBatch { + let sort_input: Vec = batch + .columns() + .iter() + .map(|col| SortColumn { + values: Arc::clone(col), + options: Some(SortOptions { + descending: false, + nulls_first: false, + }), + }) + .collect(); + + let sort_output = lexsort(&sort_input, None).expect("Sorting to complete"); + + RecordBatch::try_new(batch.schema(), sort_output).unwrap() +} + +/// Return a new `StringArray` where each element had a normalization +/// function `norm` applied. +pub fn normalize_string_array(arr: &StringArray, norm: N) -> ArrayRef +where + N: Fn(&str) -> String, +{ + let normalized: StringArray = arr.iter().map(|s| s.map(&norm)).collect(); + Arc::new(normalized) +} + +/// Return a new set of `RecordBatch`es where the function `norm` has +/// applied to all `StringArray` rows. 
+pub fn normalize_batches(batches: Vec, norm: N) -> Vec +where + N: Fn(&str) -> String, +{ + // The idea here is is to get a function that normalizes strings + // and apply it to each StringArray element by element + batches + .into_iter() + .map(|batch| { + let new_columns: Vec<_> = batch + .columns() + .iter() + .map(|array| { + if let Some(array) = array.as_any().downcast_ref::() { + normalize_string_array(array, &norm) + } else { + Arc::clone(array) + } + }) + .collect(); + + RecordBatch::try_new(batch.schema(), new_columns) + .expect("error occurred during normalization") + }) + .collect() +} + +/// Equalize batch schemas by creating NULL columns. +pub fn equalize_batch_schemas(batches: Vec) -> Result, ArrowError> { + let common_schema = Arc::new(Schema::try_merge( + batches.iter().map(|batch| batch.schema().as_ref().clone()), + )?); + + Ok(batches + .into_iter() + .map(|batch| { + let batch_schema = batch.schema(); + let columns = common_schema + .fields() + .iter() + .map(|field| match batch_schema.index_of(field.name()) { + Ok(idx) => Arc::clone(batch.column(idx)), + Err(_) => new_null_array(field.data_type(), batch.num_rows()), + }) + .collect(); + RecordBatch::try_new(Arc::clone(&common_schema), columns).unwrap() + }) + .collect()) +} + +/// Match the parquet UUID +/// +/// For example, given +/// `32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet` +/// +/// matches `1d325760-2b20-48de-ab48-2267b034133d` +pub static REGEX_UUID: Lazy = Lazy::new(|| { + Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").expect("UUID regex") +}); + +/// Match the parquet directory names +/// For example, given +/// `51/216/1a3f45021a3f45021a3f45021a3f45021a3f45021a3f45021a3f45021a3f4502/1d325760-2b20-48de-ab48-2267b034133d.parquet` +/// +/// matches `51/216/1a3f45021a3f45021a3f45021a3f45021a3f45021a3f45021a3f45021a3f4502` +static REGEX_DIRS: Lazy = + Lazy::new(|| Regex::new(r#"[0-9]+/[0-9]+/[0-9a-f]+/"#).expect("directory regex")); + +/// 
Replace table row separators of flexible width with fixed with. This is required +/// because the original timing values may differ in "printed width", so the table +/// cells have different widths and hence the separators / borders. E.g.: +/// +/// `+--+--+` -> `----------` +/// `+--+------+` -> `----------` +/// +/// Note that we're kinda inexact with our regex here, but it gets the job done. +static REGEX_LINESEP: Lazy = Lazy::new(|| Regex::new(r#"[+-]{6,}"#).expect("linesep regex")); + +/// Similar to the row separator issue above, the table columns are right-padded +/// with spaces. Due to the different "printed width" of the timing values, we need +/// to normalize this padding as well. E.g.: +/// +/// ` |` -> ` |` +/// ` |` -> ` |` +static REGEX_COL: Lazy = Lazy::new(|| Regex::new(r"\s+\|").expect("col regex")); + +/// Matches line like `metrics=[foo=1, bar=2]` +static REGEX_METRICS: Lazy = + Lazy::new(|| Regex::new(r"metrics=\[([^\]]*)\]").expect("metrics regex")); + +/// Matches things like `1s`, `1.2ms` and `10.2μs` +static REGEX_TIMING: Lazy = + Lazy::new(|| Regex::new(r"[0-9]+(\.[0-9]+)?.s").expect("timing regex")); + +/// Matches things like `FilterExec: .*` and `ParquetExec: .*` +/// +/// Should be used in combination w/ [`REGEX_TIME_OP`]. +static REGEX_FILTER: Lazy = Lazy::new(|| { + Regex::new("(?P(FilterExec)|(ParquetExec): )(?P.*)").expect("filter regex") +}); + +/// Matches things like `time@3 < -9223372036854775808` and `time_min@2 > 1641031200399937022` +static REGEX_TIME_OP: Lazy = Lazy::new(|| { + Regex::new("(?Ptime((_min)|(_max))?@[0-9]+ [<>=]=? 
(CAST\\()?)(?P-?[0-9]+)(?P AS Timestamp\\(Nanosecond, \"[^\"]\"\\)\\))?") + .expect("time opt regex") +}); + +fn normalize_for_variable_width(s: Cow<'_, str>) -> String { + let s = REGEX_LINESEP.replace_all(&s, "----------"); + REGEX_COL.replace_all(&s, " |").to_string() +} + +pub fn strip_table_lines(s: Cow<'_, str>) -> String { + let s = REGEX_LINESEP.replace_all(&s, "----------"); + REGEX_COL.replace_all(&s, "").to_string() +} + +fn normalize_time_ops(s: &str) -> String { + REGEX_TIME_OP + .replace_all(s, |c: &Captures<'_>| { + let prefix = c.name("prefix").expect("always captures").as_str(); + let suffix = c.name("suffix").map_or("", |m| m.as_str()); + format!("{prefix}{suffix}") + }) + .to_string() +} + +/// A query to run with optional annotations +#[derive(Debug, PartialEq, Eq, Default, Clone, Copy)] +pub struct Normalizer { + /// If true, results are sorted first + pub sorted_compare: bool, + + /// If true, replace UUIDs with static placeholders. + pub normalized_uuids: bool, + + /// If true, normalize timings in queries by replacing them with + /// static placeholders, for example: + /// + /// `1s` -> `1.234ms` + pub normalized_metrics: bool, + + /// if true, normalize filter predicates for explain plans + /// `FilterExec: ` + pub normalized_filters: bool, + + /// if `true`, render tables without borders. 
+ pub no_table_borders: bool, +} + +impl Normalizer { + pub fn new() -> Self { + Default::default() + } + + /// Take the output of running the query and apply the specified normalizations to them + pub fn normalize_results(&self, mut results: Vec) -> Vec { + // compare against sorted results, if requested + if self.sorted_compare && !results.is_empty() { + let schema = results[0].schema(); + let batch = + arrow::compute::concat_batches(&schema, &results).expect("concatenating batches"); + results = vec![sort_record_batch(batch)]; + } + + let mut current_results = pretty_format_batches(&results) + .unwrap() + .trim() + .lines() + .map(|s| s.to_string()) + .collect::>(); + + // normalize UUIDs, if requested + if self.normalized_uuids { + let mut seen: HashMap = HashMap::new(); + current_results = current_results + .into_iter() + .map(|s| { + // Rewrite Parquet directory names like + // `51/216/1a3f45021a3f45021a3f45021a3f45021a3f45021a3f45021a3f45021a3f4502/1d325760-2b20-48de-ab48-2267b034133d.parquet` + // + // to: + // 1/1/1/00000000-0000-0000-0000-000000000000.parquet + + let s = REGEX_UUID.replace_all(&s, |s: &Captures<'_>| { + let next = seen.len() as u128; + Uuid::from_u128( + *seen + .entry(s.get(0).unwrap().as_str().to_owned()) + .or_insert(next), + ) + .to_string() + }); + + let s = normalize_for_variable_width(s); + REGEX_DIRS.replace_all(&s, "1/1/1/").to_string() + }) + .collect(); + } + + // normalize metrics, if requested + if self.normalized_metrics { + current_results = current_results + .into_iter() + .map(|s| { + // Replace timings with fixed value, e.g.: + // + // `1s` -> `1.234ms` + // `1.2ms` -> `1.234ms` + // `10.2μs` -> `1.234ms` + let s = REGEX_TIMING.replace_all(&s, "1.234ms"); + + let s = normalize_for_variable_width(s); + + // Metrics are currently ordered by value (not by key), so different timings may + // reorder them. We "parse" the list and normalize the sorting. 
E.g.: + // + // `metrics=[]` => `metrics=[]` + // `metrics=[foo=1, bar=2]` => `metrics=[bar=2, foo=1]` + // `metrics=[foo=2, bar=1]` => `metrics=[bar=1, foo=2]` + REGEX_METRICS + .replace_all(&s, |c: &Captures<'_>| { + let mut metrics: Vec<_> = c[1].split(", ").collect(); + metrics.sort(); + format!("metrics=[{}]", metrics.join(", ")) + }) + .to_string() + }) + .collect(); + } + + // normalize Filters, if requested + // + // Converts: + // FilterExec: time@2 < -9223372036854775808 OR time@2 > 1640995204240217000 + // ParquetExec: limit=None, partitions={...}, predicate=time@2 > 1640995204240217000, pruning_predicate=time@2 > 1640995204240217000, output_ordering=[...], projection=[...] + // + // to + // FilterExec: time@2 < OR time@2 > + // ParquetExec: limit=None, partitions={...}, predicate=time@2 > , pruning_predicate=time@2 > , output_ordering=[...], projection=[...] + if self.normalized_filters { + current_results = current_results + .into_iter() + .map(|s| { + REGEX_FILTER + .replace_all(&s, |c: &Captures<'_>| { + let prefix = c.name("prefix").expect("always captrues").as_str(); + + let expr = c.name("expr").expect("always captures").as_str(); + let expr = normalize_time_ops(expr); + + format!("{prefix}{expr}") + }) + .to_string() + }) + .collect(); + } + + current_results + } + + /// Adds information on what normalizations were applied to the input + pub fn add_description(&self, output: &mut Vec) { + if self.sorted_compare { + output.push("-- Results After Sorting".into()) + } + if self.normalized_uuids { + output.push("-- Results After Normalizing UUIDs".into()) + } + if self.normalized_metrics { + output.push("-- Results After Normalizing Metrics".into()) + } + if self.normalized_filters { + output.push("-- Results After Normalizing Filters".into()) + } + if self.no_table_borders { + output.push("-- Results After No Table Borders".into()) + } + } +} diff --git a/arrow_util/src/util.rs b/arrow_util/src/util.rs new file mode 100644 index 0000000..3677dd0 --- 
/dev/null +++ b/arrow_util/src/util.rs @@ -0,0 +1,57 @@ +//! Utility functions for working with arrow + +use std::iter::FromIterator; +use std::sync::Arc; + +use arrow::{ + array::{new_null_array, ArrayRef, StringArray}, + datatypes::SchemaRef, + error::ArrowError, + record_batch::RecordBatch, +}; + +/// Returns a single column record batch of type Utf8 from the +/// contents of something that can be turned into an iterator over +/// `Option<&str>` +pub fn str_iter_to_batch(field_name: &str, iter: I) -> Result +where + I: IntoIterator>, + Ptr: AsRef, +{ + let array = StringArray::from_iter(iter); + + RecordBatch::try_from_iter(vec![(field_name, Arc::new(array) as ArrayRef)]) +} + +/// Ensures the record batch has the specified schema +pub fn ensure_schema( + output_schema: &SchemaRef, + batch: &RecordBatch, +) -> Result { + let batch_schema = batch.schema(); + + // Go over all columns of output_schema + let batch_output_columns = output_schema + .fields() + .iter() + .map(|output_field| { + // See if the output_field available in the batch + let batch_field_index = batch_schema + .fields() + .iter() + .enumerate() + .find(|(_, batch_field)| output_field.name() == batch_field.name()) + .map(|(idx, _)| idx); + + if let Some(batch_field_index) = batch_field_index { + // The column available, use it + Arc::clone(batch.column(batch_field_index)) + } else { + // the column not available, add it with all null values + new_null_array(output_field.data_type(), batch.num_rows()) + } + }) + .collect::>(); + + RecordBatch::try_new(Arc::clone(output_schema), batch_output_columns) +} diff --git a/authz/Cargo.toml b/authz/Cargo.toml new file mode 100644 index 0000000..9fc5ed9 --- /dev/null +++ b/authz/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "authz" +description = "Interface to authorization checking services" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +backoff = { 
path = "../backoff" } +http = {version = "0.2.11", optional = true } +iox_time = { version = "0.1.0", path = "../iox_time" } +generated_types = { path = "../generated_types" } +metric = { version = "0.1.0", path = "../metric" } +observability_deps = { path = "../observability_deps" } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +# crates.io dependencies in alphabetical order. +async-trait = "0.1" +base64 = "0.21.7" +snafu = "0.8" +tonic = { workspace = true } + +[dev-dependencies] +assert_matches = "1.5.0" +parking_lot = "0.12.1" +paste = "1.0.14" +test_helpers_end_to_end = { path = "../test_helpers_end_to_end" } +tokio = "1.35.1" + +[features] +http = ["dep:http"] diff --git a/authz/src/authorizer.rs b/authz/src/authorizer.rs new file mode 100644 index 0000000..488ceb5 --- /dev/null +++ b/authz/src/authorizer.rs @@ -0,0 +1,88 @@ +use std::ops::ControlFlow; + +use async_trait::async_trait; +use backoff::{Backoff, BackoffConfig}; + +use super::{Error, Permission}; + +/// An authorizer is used to validate a request +/// (+ associated permissions needed to fulfill the request) +/// with an authorization token that has been extracted from the request. +#[async_trait] +pub trait Authorizer: std::fmt::Debug + Send + Sync { + /// Determine the permissions associated with a request token. + /// + /// The returned list of permissions is the intersection of the permissions + /// requested and the permissions associated with the token. 
+ /// + /// Implementations of this trait should return the specified errors under + /// the following conditions: + /// + /// * [`Error::InvalidToken`]: the token is invalid / in an incorrect + /// format / otherwise corrupt and a permission check cannot be + /// performed + /// + /// * [`Error::NoToken`]: the token was not provided + /// + /// * [`Error::Forbidden`]: the token was well formed, but lacks + /// authorisation to perform the requested action + /// + /// * [`Error::Verification`]: the token permissions were not possible + /// to validate - an internal error has occurred + async fn permissions( + &self, + token: Option>, + perms: &[Permission], + ) -> Result, Error>; + + /// Make a test request that determines if end-to-end communication + /// with the service is working. + /// + /// Test is performed during deployment, with ordering of availability not being guaranteed. + async fn probe(&self) -> Result<(), Error> { + Backoff::new(&BackoffConfig::default()) + .retry_with_backoff("probe iox-authz service", move || { + async { + match self.permissions(Some(b"".to_vec()), &[]).await { + // got response from authorizer server + Ok(_) + | Err(Error::Forbidden) + | Err(Error::InvalidToken) + | Err(Error::NoToken) => ControlFlow::Break(Ok(())), + // communication error == Error::Verification + Err(e) => ControlFlow::<_, Error>::Continue(e), + } + } + }) + .await + .expect("retry forever") + } +} + +/// Wrapped `Option` +/// Provides response to inner `IoxAuthorizer::permissions()` +#[async_trait] +impl Authorizer for Option { + async fn permissions( + &self, + token: Option>, + perms: &[Permission], + ) -> Result, Error> { + match self { + Some(authz) => authz.permissions(token, perms).await, + // no authz rpc service => return same perms requested. Used for testing. 
+ None => Ok(perms.to_vec()), + } + } +} + +#[async_trait] +impl + std::fmt::Debug + Send + Sync> Authorizer for T { + async fn permissions( + &self, + token: Option>, + perms: &[Permission], + ) -> Result, Error> { + self.as_ref().permissions(token, perms).await + } +} diff --git a/authz/src/http.rs b/authz/src/http.rs new file mode 100644 index 0000000..e45b37e --- /dev/null +++ b/authz/src/http.rs @@ -0,0 +1,29 @@ +//! HTTP authorisation helpers. + +use http::HeaderValue; + +/// We strip off the "authorization" header from the request, to prevent it from being accidentally logged +/// and we put it in an extension of the request. Extensions are typed and this is the typed wrapper that +/// holds an (optional) authorization header value. +pub struct AuthorizationHeaderExtension(Option); + +impl AuthorizationHeaderExtension { + /// Construct new extension wrapper for a possible header value + pub fn new(header: Option) -> Self { + Self(header) + } +} + +impl std::fmt::Debug for AuthorizationHeaderExtension { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str("AuthorizationHeaderExtension(...)") + } +} + +impl std::ops::Deref for AuthorizationHeaderExtension { + type Target = Option; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} diff --git a/authz/src/instrumentation.rs b/authz/src/instrumentation.rs new file mode 100644 index 0000000..b64fc2c --- /dev/null +++ b/authz/src/instrumentation.rs @@ -0,0 +1,248 @@ +use async_trait::async_trait; +use iox_time::{SystemProvider, TimeProvider}; +use metric::{DurationHistogram, Metric, Registry}; + +use super::{Authorizer, Error, Permission}; + +const AUTHZ_DURATION_METRIC: &str = "authz_permission_check_duration"; + +/// An instrumentation decorator over a [`Authorizer`] implementation. +/// +/// This wrapper captures the latency distribution of the decorated +/// [`Authorizer::permissions()`] call, faceted by success/error result. 
+#[derive(Debug)] +pub struct AuthorizerInstrumentation { + inner: T, + time_provider: P, + + /// Permissions-check duration distribution for successesful rpc, but not authorized. + ioxauth_rpc_duration_success_unauth: DurationHistogram, + + /// Permissions-check duration distribution for successesful rpc + authorized. + ioxauth_rpc_duration_success_auth: DurationHistogram, + + /// Permissions-check duration distribution for errors. + ioxauth_rpc_duration_error: DurationHistogram, +} + +impl AuthorizerInstrumentation { + /// Record permissions-check duration metrics, broken down by result. + pub fn new(registry: &Registry, inner: T) -> Self { + let metric: Metric = + registry.register_metric(AUTHZ_DURATION_METRIC, "duration of authz permissions check"); + + let ioxauth_rpc_duration_success_unauth = + metric.recorder(&[("result", "success"), ("auth_state", "unauthorised")]); + let ioxauth_rpc_duration_success_auth = + metric.recorder(&[("result", "success"), ("auth_state", "authorised")]); + let ioxauth_rpc_duration_error = + metric.recorder(&[("result", "error"), ("auth_state", "unauthorised")]); + + Self { + inner, + time_provider: Default::default(), + ioxauth_rpc_duration_success_unauth, + ioxauth_rpc_duration_success_auth, + ioxauth_rpc_duration_error, + } + } +} + +#[async_trait] +impl Authorizer for AuthorizerInstrumentation +where + T: Authorizer, +{ + async fn permissions( + &self, + token: Option>, + perms: &[Permission], + ) -> Result, Error> { + let t = self.time_provider.now(); + let res = self.inner.permissions(token, perms).await; + + if let Some(delta) = self.time_provider.now().checked_duration_since(t) { + match &res { + Ok(_) => self.ioxauth_rpc_duration_success_auth.record(delta), + Err(Error::Forbidden) | Err(Error::InvalidToken) => { + self.ioxauth_rpc_duration_success_unauth.record(delta) + } + Err(Error::Verification { .. 
}) => self.ioxauth_rpc_duration_error.record(delta), + Err(Error::NoToken) => {} // rpc was not made + }; + } + + res + } +} + +#[cfg(test)] +mod test { + use std::collections::VecDeque; + + use metric::{assert_histogram, Attributes, Registry}; + use parking_lot::Mutex; + + use super::*; + use crate::{Action, Resource}; + + #[derive(Debug, Default)] + struct MockAuthorizerState { + ret: VecDeque, Error>>, + } + + #[derive(Debug, Default)] + struct MockAuthorizer { + state: Mutex, + } + + impl MockAuthorizer { + pub(crate) fn with_permissions_return( + self, + ret: impl Into, Error>>>, + ) -> Self { + self.state.lock().ret = ret.into(); + self + } + } + + #[async_trait] + impl Authorizer for MockAuthorizer { + async fn permissions( + &self, + _token: Option>, + _perms: &[Permission], + ) -> Result, Error> { + self.state + .lock() + .ret + .pop_front() + .expect("no mock sink value to return") + } + } + + macro_rules! assert_metric_counts { + ( + $metrics:ident, + expected_success = $expected_success:expr, + expected_rpc_success_unauth = $expected_rpc_success_unauth:expr, + expected_rpc_failures = $expected_rpc_failures:expr, + ) => { + let histogram = &$metrics + .get_instrument::>(AUTHZ_DURATION_METRIC) + .expect("failed to read metric"); + + let success_labels = + Attributes::from(&[("result", "success"), ("auth_state", "authorised")]); + let histogram_success = &histogram + .get_observer(&success_labels) + .expect("failed to find metric with provided attributes") + .fetch(); + + assert_histogram!( + $metrics, + DurationHistogram, + AUTHZ_DURATION_METRIC, + labels = success_labels, + samples = $expected_success, + sum = histogram_success.total, + ); + + let success_unauth_labels = + Attributes::from(&[("result", "success"), ("auth_state", "unauthorised")]); + let histogram_success_unauth = &histogram + .get_observer(&success_unauth_labels) + .expect("failed to find metric with provided attributes") + .fetch(); + + assert_histogram!( + $metrics, + 
DurationHistogram, + AUTHZ_DURATION_METRIC, + labels = success_unauth_labels, + samples = $expected_rpc_success_unauth, + sum = histogram_success_unauth.total, + ); + + let rpc_error_labels = + Attributes::from(&[("result", "error"), ("auth_state", "unauthorised")]); + let histogram_rpc_error = &histogram + .get_observer(&rpc_error_labels) + .expect("failed to find metric with provided attributes") + .fetch(); + + assert_histogram!( + $metrics, + DurationHistogram, + AUTHZ_DURATION_METRIC, + labels = rpc_error_labels, + samples = $expected_rpc_failures, + sum = histogram_rpc_error.total, + ); + }; + } + + macro_rules! test_authorizer_metric { + ( + $name:ident, + rpc_response = $rpc_response:expr, + will_pass_auth = $will_pass_auth:expr, + expected_success_cnt = $expected_success_cnt:expr, + expected_success_unauth_cnt = $expected_success_unauth_cnt:expr, + expected_error_cnt = $expected_error_cnt:expr, + ) => { + paste::paste! { + #[tokio::test] + async fn []() { + let metrics = Registry::default(); + let decorated_authz = AuthorizerInstrumentation::new( + &metrics, + MockAuthorizer::default().with_permissions_return([$rpc_response]) + ); + + let token = "any".as_bytes().to_vec(); + let got = decorated_authz + .permissions(Some(token), &[]) + .await; + assert_eq!(got.is_ok(), $will_pass_auth); + assert_metric_counts!( + metrics, + expected_success = $expected_success_cnt, + expected_rpc_success_unauth = $expected_success_unauth_cnt, + expected_rpc_failures = $expected_error_cnt, + ); + } + } + }; + } + + test_authorizer_metric!( + ok, + rpc_response = Ok(vec![Permission::ResourceAction( + Resource::Database("foo".to_string()), + Action::Write, + )]), + will_pass_auth = true, + expected_success_cnt = 1, + expected_success_unauth_cnt = 0, + expected_error_cnt = 0, + ); + + test_authorizer_metric!( + will_record_failure_if_rpc_fails, + rpc_response = Err(Error::verification("test", "test error")), + will_pass_auth = false, + expected_success_cnt = 0, + 
expected_success_unauth_cnt = 0, + expected_error_cnt = 1, + ); + + test_authorizer_metric!( + will_record_success_if_rpc_pass_but_auth_fails, + rpc_response = Err(Error::InvalidToken), + will_pass_auth = false, + expected_success_cnt = 0, + expected_success_unauth_cnt = 1, + expected_error_cnt = 0, + ); +} diff --git a/authz/src/iox_authorizer.rs b/authz/src/iox_authorizer.rs new file mode 100644 index 0000000..7228d3d --- /dev/null +++ b/authz/src/iox_authorizer.rs @@ -0,0 +1,309 @@ +use async_trait::async_trait; +use generated_types::influxdata::iox::authz::v1::{self as proto, AuthorizeResponse}; +use observability_deps::tracing::warn; +use snafu::Snafu; +use tonic::Response; + +use super::{Authorizer, Permission}; + +/// Authorizer implementation using influxdata.iox.authz.v1 protocol. +#[derive(Clone, Debug)] +pub struct IoxAuthorizer { + client: + proto::iox_authorizer_service_client::IoxAuthorizerServiceClient, +} + +impl IoxAuthorizer { + /// Attempt to create a new client by connecting to a given endpoint. 
+ pub fn connect_lazy(dst: D) -> Result> + where + D: TryInto + Send, + D::Error: Into, + { + let ep = tonic::transport::Endpoint::new(dst)?; + let client = proto::iox_authorizer_service_client::IoxAuthorizerServiceClient::new( + ep.connect_lazy(), + ); + Ok(Self { client }) + } + + async fn request( + &self, + token: Vec, + requested_perms: &[Permission], + ) -> Result, tonic::Status> { + let req = proto::AuthorizeRequest { + token, + permissions: requested_perms + .iter() + .filter_map(|p| p.clone().try_into().ok()) + .collect(), + }; + let mut client = self.client.clone(); + client.authorize(req).await + } +} + +#[async_trait] +impl Authorizer for IoxAuthorizer { + async fn permissions( + &self, + token: Option>, + requested_perms: &[Permission], + ) -> Result, Error> { + let authz_rpc_result = self + .request(token.ok_or(Error::NoToken)?, requested_perms) + .await + .map_err(|status| Error::Verification { + msg: status.message().to_string(), + source: Box::new(status), + })? + .into_inner(); + + if !authz_rpc_result.valid { + return Err(Error::InvalidToken); + } + + let intersected_perms: Vec = authz_rpc_result + .permissions + .into_iter() + .filter_map(|p| match p.try_into() { + Ok(p) => Some(p), + Err(e) => { + warn!(error=%e, "authz service returned incompatible permission"); + None + } + }) + .collect(); + + if intersected_perms.is_empty() { + return Err(Error::Forbidden); + } + Ok(intersected_perms) + } +} + +/// Authorization related error. +#[derive(Debug, Snafu)] +pub enum Error { + /// Communication error when verifying a token. + #[snafu(display("token verification not possible: {msg}"))] + Verification { + /// Message describing the error. + msg: String, + /// Source of the error. + source: Box, + }, + + /// Token is invalid. + #[snafu(display("invalid token"))] + InvalidToken, + + /// The token's permissions do not allow the operation. + #[snafu(display("forbidden"))] + Forbidden, + + /// No token has been supplied, but is required. 
+ #[snafu(display("no token"))] + NoToken, +} + +impl Error { + /// Create new Error::Verification. + pub fn verification( + msg: impl Into, + source: impl Into>, + ) -> Self { + Self::Verification { + msg: msg.into(), + source: source.into(), + } + } +} + +impl From for Error { + fn from(value: tonic::Status) -> Self { + Self::verification(value.message(), value.clone()) + } +} + +#[cfg(test)] +mod test { + use std::{ + net::SocketAddr, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, + }; + + use assert_matches::assert_matches; + use test_helpers_end_to_end::Authorizer as AuthorizerServer; + use tokio::{ + net::TcpListener, + task::{spawn, JoinHandle}, + }; + use tonic::transport::{server::TcpIncoming, Server}; + + use super::*; + use crate::{Action, Authorizer, Permission, Resource}; + + const NAMESPACE: &str = "bananas"; + + macro_rules! test_iox_authorizer { + ( + $name:ident, + token_permissions = $token_permissions:expr, + permissions_required = $permissions_required:expr, + want = $want:pat + ) => { + paste::paste! 
{ + #[tokio::test] + async fn []() { + let mut authz_server = AuthorizerServer::create().await; + let authz = IoxAuthorizer::connect_lazy(authz_server.addr()) + .expect("Failed to create IoxAuthorizer client."); + + let token = authz_server.create_token_for(NAMESPACE, $token_permissions); + + let got = authz.permissions( + Some(token.as_bytes().to_vec()), + $permissions_required + ).await; + + assert_matches!(got, $want); + } + } + }; + } + + test_iox_authorizer!( + ok, + token_permissions = &["ACTION_WRITE"], + permissions_required = &[Permission::ResourceAction( + Resource::Database(NAMESPACE.to_string()), + Action::Write, + )], + want = Ok(_) + ); + + test_iox_authorizer!( + insufficient_perms, + token_permissions = &["ACTION_READ"], + permissions_required = &[Permission::ResourceAction( + Resource::Database(NAMESPACE.to_string()), + Action::Write, + )], + want = Err(Error::Forbidden) + ); + + test_iox_authorizer!( + any_of_required_perms, + token_permissions = &["ACTION_WRITE"], + permissions_required = &[ + Permission::ResourceAction(Resource::Database(NAMESPACE.to_string()), Action::Write,), + Permission::ResourceAction(Resource::Database(NAMESPACE.to_string()), Action::Create,) + ], + want = Ok(_) + ); + + #[tokio::test] + async fn test_invalid_token() { + let authz_server = AuthorizerServer::create().await; + let authz = IoxAuthorizer::connect_lazy(authz_server.addr()) + .expect("Failed to create IoxAuthorizer client."); + + let invalid_token = b"UGLY"; + + let got = authz + .permissions( + Some(invalid_token.to_vec()), + &[Permission::ResourceAction( + Resource::Database(NAMESPACE.to_string()), + Action::Read, + )], + ) + .await; + + assert_matches!(got, Err(Error::InvalidToken)); + } + + #[tokio::test] + async fn test_delayed_probe_response() { + #[derive(Default, Debug)] + struct DelayedAuthorizer(Arc); + + impl DelayedAuthorizer { + fn start_countdown(&self) { + let started = Arc::clone(&self.0); + spawn(async move { + 
tokio::time::sleep(Duration::from_secs(2)).await; + started.store(true, Ordering::Relaxed); + }); + } + } + + #[async_trait] + impl proto::iox_authorizer_service_server::IoxAuthorizerService for DelayedAuthorizer { + async fn authorize( + &self, + _request: tonic::Request, + ) -> Result, tonic::Status> { + let startup_done = self.0.load(Ordering::Relaxed); + if !startup_done { + return Err(tonic::Status::deadline_exceeded("startup not done")); + } + + Ok(tonic::Response::new(AuthorizeResponse { + valid: true, + subject: None, + permissions: vec![], + })) + } + } + + #[derive(Debug)] + struct DelayedServer { + addr: SocketAddr, + handle: JoinHandle>, + } + + impl DelayedServer { + async fn create() -> Self { + let listener = TcpListener::bind("localhost:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let incoming = TcpIncoming::from_listener(listener, false, None).unwrap(); + + // start countdown mocking startup delay of sidecar + let authz = DelayedAuthorizer::default(); + authz.start_countdown(); + + let router = Server::builder().add_service( + proto::iox_authorizer_service_server::IoxAuthorizerServiceServer::new(authz), + ); + let handle = spawn(router.serve_with_incoming(incoming)); + Self { addr, handle } + } + + fn addr(&self) -> String { + format!("http://{}", self.addr) + } + + fn close(self) { + self.handle.abort(); + } + } + + let authz_server = DelayedServer::create().await; + let authz_client = IoxAuthorizer::connect_lazy(authz_server.addr()) + .expect("Failed to create IoxAuthorizer client."); + + assert_matches!( + authz_client.probe().await, + Ok(()), + "authz probe should work even with delay" + ); + authz_server.close(); + } +} diff --git a/authz/src/lib.rs b/authz/src/lib.rs new file mode 100644 index 0000000..7b2fd54 --- /dev/null +++ b/authz/src/lib.rs @@ -0,0 +1,100 @@ +//! IOx authorization client. +//! +//! Authorization client interface to be used by IOx components to +//! 
restrict access to authorized requests where required. + +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] +#![allow(rustdoc::private_intra_doc_links)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +use base64::{prelude::BASE64_STANDARD, Engine}; +use generated_types::influxdata::iox::authz::v1::{self as proto}; +use observability_deps::tracing::warn; + +mod authorizer; +pub use authorizer::Authorizer; +mod iox_authorizer; +pub use iox_authorizer::{Error, IoxAuthorizer}; +mod instrumentation; +pub use instrumentation::AuthorizerInstrumentation; +mod permission; +pub use permission::{Action, Permission, Resource}; + +#[cfg(feature = "http")] +pub mod http; + +/// Extract a token from an HTTP header or gRPC metadata value. +pub fn extract_token + ?Sized>(value: Option<&T>) -> Option> { + let mut parts = value?.as_ref().splitn(2, |&v| v == b' '); + let token = match parts.next()? { + b"Token" | b"Bearer" => parts.next()?.to_vec(), + b"Basic" => parts + .next() + .and_then(|v| BASE64_STANDARD.decode(v).ok())? + .splitn(2, |&v| v == b':') + .nth(1)? 
+ .to_vec(), + _ => return None, + }; + if token.is_empty() { + None + } else { + Some(token) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn verify_error_from_tonic_status() { + let s = tonic::Status::resource_exhausted("test error"); + let e = Error::from(s); + assert_eq!( + "token verification not possible: test error", + format!("{e}") + ) + } + + #[test] + fn test_extract_token() { + assert_eq!(None, extract_token::<&str>(None)); + assert_eq!(None, extract_token(Some(""))); + assert_eq!(None, extract_token(Some("Basic"))); + assert_eq!(None, extract_token(Some("Basic Og=="))); // ":" + assert_eq!(None, extract_token(Some("Basic dXNlcm5hbWU6"))); // "username:" + assert_eq!(None, extract_token(Some("Basic Og=="))); // ":" + assert_eq!( + Some(b"password".to_vec()), + extract_token(Some("Basic OnBhc3N3b3Jk")) + ); // ":password" + assert_eq!( + Some(b"password2".to_vec()), + extract_token(Some("Basic dXNlcm5hbWU6cGFzc3dvcmQy")) + ); // "username:password2" + assert_eq!(None, extract_token(Some("Bearer"))); + assert_eq!(None, extract_token(Some("Bearer "))); + assert_eq!(Some(b"token".to_vec()), extract_token(Some("Bearer token"))); + assert_eq!(None, extract_token(Some("Token"))); + assert_eq!(None, extract_token(Some("Token "))); + assert_eq!( + Some(b"token2".to_vec()), + extract_token(Some("Token token2")) + ); + } +} diff --git a/authz/src/permission.rs b/authz/src/permission.rs new file mode 100644 index 0000000..9ffced0 --- /dev/null +++ b/authz/src/permission.rs @@ -0,0 +1,310 @@ +use super::proto; +use snafu::Snafu; + +/// Action is the type of operation being attempted on a resource. +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Action { + /// The create action is used when a new instance of the resource will + /// be created. + Create, + /// The delete action is used when a resource will be deleted. + Delete, + /// The read action is used when the data contained by a resource will + /// be read. 
+ Read, + /// The read-schema action is used when only metadata about a resource + /// will be read. + ReadSchema, + /// The write action is used when data is being written to the resource. + Write, +} + +impl TryFrom for Action { + type Error = IncompatiblePermissionError; + + fn try_from(value: proto::resource_action_permission::Action) -> Result { + match value { + proto::resource_action_permission::Action::ReadSchema => Ok(Self::ReadSchema), + proto::resource_action_permission::Action::Read => Ok(Self::Read), + proto::resource_action_permission::Action::Write => Ok(Self::Write), + proto::resource_action_permission::Action::Create => Ok(Self::Create), + proto::resource_action_permission::Action::Delete => Ok(Self::Delete), + _ => Err(IncompatiblePermissionError {}), + } + } +} + +impl From for proto::resource_action_permission::Action { + fn from(value: Action) -> Self { + match value { + Action::Create => Self::Create, + Action::Delete => Self::Delete, + Action::Read => Self::Read, + Action::ReadSchema => Self::ReadSchema, + Action::Write => Self::Write, + } + } +} + +/// An incompatible-permission-error is the error that is returned if +/// there is an attempt to convert a permssion into a form that is +/// unsupported. For the most part this should not cause an error to +/// be returned to the user, but more as a signal that the conversion +/// can never succeed and therefore the permisison can never be granted. +/// This error will normally be silently dropped along with the source +/// permission that caused it. +#[derive(Clone, Copy, Debug, PartialEq, Snafu)] +#[snafu(display("incompatible permission"))] +pub struct IncompatiblePermissionError {} + +/// A permission is an authorization that can be checked with an +/// authorizer. Not all authorizers neccessarily support all forms of +/// permission. If an authorizer doesn't support a permission then it +/// is not an error, the permission will always be denied. 
+#[derive(Clone, Debug, PartialEq)] +pub enum Permission { + /// ResourceAction is a permission in the form of a reasource and an + /// action. + ResourceAction(Resource, Action), +} + +impl TryFrom for Permission { + type Error = IncompatiblePermissionError; + + fn try_from(value: proto::Permission) -> Result { + match value.permission_one_of { + Some(proto::permission::PermissionOneOf::ResourceAction(ra)) => { + let r = Resource::try_from_proto( + proto::resource_action_permission::ResourceType::try_from(ra.resource_type) + .map_err(|_| IncompatiblePermissionError {})?, + ra.resource_id, + )?; + let a = Action::try_from( + proto::resource_action_permission::Action::try_from(ra.action) + .map_err(|_| IncompatiblePermissionError {})?, + )?; + Ok(Self::ResourceAction(r, a)) + } + _ => Err(IncompatiblePermissionError {}), + } + } +} + +impl TryFrom for proto::Permission { + type Error = IncompatiblePermissionError; + + fn try_from(value: Permission) -> Result { + match value { + Permission::ResourceAction(r, a) => { + let (rt, ri) = r.try_into_proto()?; + let a: proto::resource_action_permission::Action = a.into(); + Ok(Self { + permission_one_of: Some(proto::permission::PermissionOneOf::ResourceAction( + proto::ResourceActionPermission { + resource_type: rt as i32, + resource_id: ri, + action: a as i32, + }, + )), + }) + } + } + } +} + +/// A resource is the object that a request is trying to access. +#[derive(Clone, Debug, PartialEq)] +pub enum Resource { + /// A database is a named IOx database. 
+ Database(String), +} + +impl Resource { + fn try_from_proto( + rt: proto::resource_action_permission::ResourceType, + ri: Option, + ) -> Result { + match (rt, ri) { + (proto::resource_action_permission::ResourceType::Database, Some(s)) => { + Ok(Self::Database(s)) + } + _ => Err(IncompatiblePermissionError {}), + } + } + + fn try_into_proto( + self, + ) -> Result< + ( + proto::resource_action_permission::ResourceType, + Option, + ), + IncompatiblePermissionError, + > { + match self { + Self::Database(s) => Ok(( + proto::resource_action_permission::ResourceType::Database, + Some(s), + )), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn action_try_from_proto() { + assert_eq!( + Action::Create, + Action::try_from(proto::resource_action_permission::Action::Create).unwrap(), + ); + assert_eq!( + Action::Delete, + Action::try_from(proto::resource_action_permission::Action::Delete).unwrap(), + ); + assert_eq!( + Action::Read, + Action::try_from(proto::resource_action_permission::Action::Read).unwrap(), + ); + assert_eq!( + Action::ReadSchema, + Action::try_from(proto::resource_action_permission::Action::ReadSchema).unwrap(), + ); + assert_eq!( + Action::Write, + Action::try_from(proto::resource_action_permission::Action::Write).unwrap(), + ); + assert_eq!( + IncompatiblePermissionError {}, + Action::try_from(proto::resource_action_permission::Action::Unspecified).unwrap_err(), + ); + } + + #[test] + fn action_into_proto() { + assert_eq!( + proto::resource_action_permission::Action::Create, + proto::resource_action_permission::Action::from(Action::Create) + ); + assert_eq!( + proto::resource_action_permission::Action::Delete, + proto::resource_action_permission::Action::from(Action::Delete) + ); + assert_eq!( + proto::resource_action_permission::Action::Read, + proto::resource_action_permission::Action::from(Action::Read) + ); + assert_eq!( + proto::resource_action_permission::Action::ReadSchema, + 
proto::resource_action_permission::Action::from(Action::ReadSchema) + ); + assert_eq!( + proto::resource_action_permission::Action::Write, + proto::resource_action_permission::Action::from(Action::Write) + ); + } + + #[test] + fn resource_try_from_proto() { + assert_eq!( + Resource::Database("ns1".into()), + Resource::try_from_proto( + proto::resource_action_permission::ResourceType::Database, + Some("ns1".into()) + ) + .unwrap() + ); + assert_eq!( + IncompatiblePermissionError {}, + Resource::try_from_proto( + proto::resource_action_permission::ResourceType::Database, + None + ) + .unwrap_err() + ); + assert_eq!( + IncompatiblePermissionError {}, + Resource::try_from_proto( + proto::resource_action_permission::ResourceType::Unspecified, + Some("ns1".into()) + ) + .unwrap_err() + ); + } + + #[test] + fn resource_try_into_proto() { + assert_eq!( + ( + proto::resource_action_permission::ResourceType::Database, + Some("ns1".into()) + ), + Resource::Database("ns1".into()).try_into_proto().unwrap(), + ); + } + + #[test] + fn permission_try_from_proto() { + assert_eq!( + Permission::ResourceAction(Resource::Database("ns2".into()), Action::Create), + Permission::try_from(proto::Permission { + permission_one_of: Some(proto::permission::PermissionOneOf::ResourceAction( + proto::ResourceActionPermission { + resource_type: 1, + resource_id: Some("ns2".into()), + action: 4, + } + )) + }) + .unwrap() + ); + assert_eq!( + IncompatiblePermissionError {}, + Permission::try_from(proto::Permission { + permission_one_of: Some(proto::permission::PermissionOneOf::ResourceAction( + proto::ResourceActionPermission { + resource_type: 0, + resource_id: Some("ns2".into()), + action: 4, + } + )) + }) + .unwrap_err() + ); + assert_eq!( + IncompatiblePermissionError {}, + Permission::try_from(proto::Permission { + permission_one_of: Some(proto::permission::PermissionOneOf::ResourceAction( + proto::ResourceActionPermission { + resource_type: 1, + resource_id: Some("ns2".into()), + action: 0, + 
} + )) + }) + .unwrap_err() + ); + } + + #[test] + fn permission_try_into_proto() { + assert_eq!( + proto::Permission { + permission_one_of: Some(proto::permission::PermissionOneOf::ResourceAction( + proto::ResourceActionPermission { + resource_type: 1, + resource_id: Some("ns3".into()), + action: 4, + } + )) + }, + proto::Permission::try_from(Permission::ResourceAction( + Resource::Database("ns3".into()), + Action::Create + )) + .unwrap() + ); + } +} diff --git a/backoff/Cargo.toml b/backoff/Cargo.toml new file mode 100644 index 0000000..484412f --- /dev/null +++ b/backoff/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "backoff" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +tokio = { version = "1.35", features = ["macros", "time"] } +observability_deps = { path = "../observability_deps" } +rand = "0.8" +snafu = "0.8" +workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/backoff/src/lib.rs b/backoff/src/lib.rs new file mode 100644 index 0000000..907847b --- /dev/null +++ b/backoff/src/lib.rs @@ -0,0 +1,399 @@ +//! Backoff functionality. +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +use observability_deps::tracing::warn; +use rand::prelude::*; +use snafu::Snafu; +use std::ops::ControlFlow; +use std::time::Duration; + +/// Exponential backoff with jitter +/// +/// See +#[derive(Debug, Clone, PartialEq)] +#[allow(missing_copy_implementations)] +pub struct BackoffConfig { + /// Initial backoff. 
+ pub init_backoff: Duration, + + /// Maximum backoff. + pub max_backoff: Duration, + + /// Multiplier for each backoff round. + pub base: f64, + + /// Deadline after which we give up retrying. + pub deadline: Option, +} + +impl Default for BackoffConfig { + fn default() -> Self { + Self { + init_backoff: Duration::from_millis(100), + max_backoff: Duration::from_secs(500), + base: 3., + deadline: None, + } + } +} + +/// Error after giving up retrying. +#[derive(Debug, Snafu, PartialEq, Eq)] +#[allow(missing_copy_implementations, missing_docs)] +pub enum BackoffError +where + E: std::error::Error + 'static, +{ + #[snafu(display("Retry did not exceed within {deadline:?}: {source}"))] + DeadlineExceeded { deadline: Duration, source: E }, +} + +/// Backoff result. +pub type BackoffResult = Result>; + +/// [`Backoff`] can be created from a [`BackoffConfig`] +/// +/// Consecutive calls to [`Backoff::next`] will return the next backoff interval +/// +pub struct Backoff { + init_backoff: f64, + next_backoff_secs: f64, + max_backoff_secs: f64, + base: f64, + total: f64, + deadline: Option, + rng: Option>, +} + +impl std::fmt::Debug for Backoff { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Backoff") + .field("init_backoff", &self.init_backoff) + .field("next_backoff_secs", &self.next_backoff_secs) + .field("max_backoff_secs", &self.max_backoff_secs) + .field("base", &self.base) + .field("total", &self.total) + .field("deadline", &self.deadline) + .finish() + } +} + +impl Backoff { + /// Create a new [`Backoff`] from the provided [`BackoffConfig`]. + /// + /// # Panics + /// Panics if [`BackoffConfig::base`] is not finite or < 1.0. + pub fn new(config: &BackoffConfig) -> Self { + Self::new_with_rng(config, None) + } + + /// Creates a new `Backoff` with the optional `rng`. + /// + /// Uses [`rand::thread_rng()`] if no rng is provided. + /// + /// See [`new`](Self::new) for panic handling.
+ pub fn new_with_rng( + config: &BackoffConfig, + rng: Option>, + ) -> Self { + assert!( + config.base.is_finite(), + "Backoff base ({}) must be finite.", + config.base, + ); + assert!( + config.base >= 1.0, + "Backoff base ({}) must be greater or equal than 1.", + config.base, + ); + + let max_backoff = config.max_backoff.as_secs_f64(); + let init_backoff = config.init_backoff.as_secs_f64().min(max_backoff); + Self { + init_backoff, + next_backoff_secs: init_backoff, + max_backoff_secs: max_backoff, + base: config.base, + total: 0.0, + deadline: config.deadline.map(|d| d.as_secs_f64()), + rng, + } + } + + /// Fade this backoff over to a different backoff config. + pub fn fade_to(&mut self, config: &BackoffConfig) { + // Note: `new` won't have the same RNG, but this doesn't matter + let new = Self::new(config); + + *self = Self { + init_backoff: new.init_backoff, + next_backoff_secs: self.next_backoff_secs, + max_backoff_secs: new.max_backoff_secs, + base: new.base, + total: self.total, + deadline: new.deadline, + rng: self.rng.take(), + }; + } + + /// Perform an async operation that retries with a backoff + pub async fn retry_with_backoff( + &mut self, + task_name: &str, + mut do_stuff: F, + ) -> BackoffResult + where + F: (FnMut() -> F1) + Send, + F1: std::future::Future> + Send, + E: std::error::Error + Send + 'static, + { + let mut fail_count = 0_usize; + loop { + // first execute `F` and then use it, so we can avoid `F: Sync`. 
+ let do_stuff = do_stuff(); + + let e = match do_stuff.await { + ControlFlow::Break(r) => break Ok(r), + ControlFlow::Continue(e) => e, + }; + + let backoff = match self.next() { + Some(backoff) => backoff, + None => { + return Err(BackoffError::DeadlineExceeded { + deadline: Duration::from_secs_f64(self.deadline.expect("deadline")), + source: e, + }); + } + }; + + fail_count += 1; + + warn!( + error=%e, + task_name, + backoff_secs = backoff.as_secs(), + fail_count, + "request encountered non-fatal error - backing off", + ); + tokio::time::sleep(backoff).await; + } + } + + /// Retry all errors. + pub async fn retry_all_errors( + &mut self, + task_name: &str, + mut do_stuff: F, + ) -> BackoffResult + where + F: (FnMut() -> F1) + Send, + F1: std::future::Future> + Send, + E: std::error::Error + Send + 'static, + { + self.retry_with_backoff(task_name, move || { + // first execute `F` and then use it, so we can avoid `F: Sync`. + let do_stuff = do_stuff(); + + async { + match do_stuff.await { + Ok(b) => ControlFlow::Break(b), + Err(e) => ControlFlow::Continue(e), + } + } + }) + .await + } +} + +impl Iterator for Backoff { + type Item = Duration; + + /// Returns the next backoff duration to wait for, if any + fn next(&mut self) -> Option { + let range = self.init_backoff..=(self.next_backoff_secs * self.base); + + let rand_backoff = match self.rng.as_mut() { + Some(rng) => rng.gen_range(range), + None => thread_rng().gen_range(range), + }; + + let next_backoff = self.max_backoff_secs.min(rand_backoff); + self.total += next_backoff; + let res = std::mem::replace(&mut self.next_backoff_secs, next_backoff); + if let Some(deadline) = self.deadline { + if self.total >= deadline { + return None; + } + } + duration_try_from_secs_f64(res) + } +} + +const MAX_F64_SECS: f64 = 1_000_000.0; + +/// Try to get `Duration` from `f64` secs. +/// +/// This is required till <https://github.com/rust-lang/rust/issues/83400> is resolved.
+fn duration_try_from_secs_f64(secs: f64) -> Option { + (secs.is_finite() && (0.0..=MAX_F64_SECS).contains(&secs)) + .then(|| Duration::from_secs_f64(secs)) +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::rngs::mock::StepRng; + + #[test] + fn test_backoff() { + let init_backoff_secs = 1.; + let max_backoff_secs = 500.; + let base = 3.; + + let config = BackoffConfig { + init_backoff: Duration::from_secs_f64(init_backoff_secs), + max_backoff: Duration::from_secs_f64(max_backoff_secs), + deadline: None, + base, + }; + + let assert_fuzzy_eq = |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{a} != {b}"); + + // Create a static rng that takes the minimum of the range + let rng = Box::new(StepRng::new(0, 0)); + let mut backoff = Backoff::new_with_rng(&config, Some(rng)); + + for _ in 0..20 { + assert_eq!(backoff.next().unwrap().as_secs_f64(), init_backoff_secs); + } + + // Create a static rng that takes the maximum of the range + let rng = Box::new(StepRng::new(u64::MAX, 0)); + let mut backoff = Backoff::new_with_rng(&config, Some(rng)); + + for i in 0..20 { + let value = (base.powi(i) * init_backoff_secs).min(max_backoff_secs); + assert_fuzzy_eq(backoff.next().unwrap().as_secs_f64(), value); + } + + // Create a static rng that takes the mid point of the range + let rng = Box::new(StepRng::new(u64::MAX / 2, 0)); + let mut backoff = Backoff::new_with_rng(&config, Some(rng)); + + let mut value = init_backoff_secs; + for _ in 0..20 { + assert_fuzzy_eq(backoff.next().unwrap().as_secs_f64(), value); + value = + (init_backoff_secs + (value * base - init_backoff_secs) / 2.).min(max_backoff_secs); + } + + // deadline + let rng = Box::new(StepRng::new(u64::MAX, 0)); + let deadline = Duration::from_secs_f64(init_backoff_secs); + let mut backoff = Backoff::new_with_rng( + &BackoffConfig { + deadline: Some(deadline), + ..config + }, + Some(rng), + ); + assert_eq!(backoff.next(), None); + } + + #[test] + fn test_overflow() { + let rng = Box::new(StepRng::new(u64::MAX, 
0)); + let cfg = BackoffConfig { + init_backoff: Duration::MAX, + max_backoff: Duration::MAX, + ..Default::default() + }; + let mut backoff = Backoff::new_with_rng(&cfg, Some(rng)); + assert_eq!(backoff.next(), None); + } + + #[test] + fn test_duration_try_from_f64() { + for d in [-0.1, f64::INFINITY, f64::NAN, MAX_F64_SECS + 0.1] { + assert!(duration_try_from_secs_f64(d).is_none()); + } + + for d in [0.0, MAX_F64_SECS] { + assert!(duration_try_from_secs_f64(d).is_some()); + } + } + + #[test] + fn test_max_backoff_smaller_init() { + let rng = Box::new(StepRng::new(u64::MAX, 0)); + let cfg = BackoffConfig { + init_backoff: Duration::from_secs(2), + max_backoff: Duration::from_secs(1), + ..Default::default() + }; + let mut backoff = Backoff::new_with_rng(&cfg, Some(rng)); + assert_eq!(backoff.next(), Some(Duration::from_secs(1))); + assert_eq!(backoff.next(), Some(Duration::from_secs(1))); + } + + #[test] + #[should_panic(expected = "Backoff base (inf) must be finite.")] + fn test_panic_inf_base() { + let cfg = BackoffConfig { + base: f64::INFINITY, + ..Default::default() + }; + Backoff::new(&cfg); + } + + #[test] + #[should_panic(expected = "Backoff base (NaN) must be finite.")] + fn test_panic_nan_base() { + let cfg = BackoffConfig { + base: f64::NAN, + ..Default::default() + }; + Backoff::new(&cfg); + } + + #[test] + #[should_panic(expected = "Backoff base (0) must be greater or equal than 1.")] + fn test_panic_zero_base() { + let cfg = BackoffConfig { + base: 0.0, + ..Default::default() + }; + Backoff::new(&cfg); + } + + #[test] + fn test_constant_backoff() { + let rng = Box::new(StepRng::new(u64::MAX, 0)); + let cfg = BackoffConfig { + init_backoff: Duration::from_secs(1), + max_backoff: Duration::from_secs(1), + base: 1.0, + ..Default::default() + }; + let mut backoff = Backoff::new_with_rng(&cfg, Some(rng)); + assert_eq!(backoff.next(), Some(Duration::from_secs(1))); + assert_eq!(backoff.next(), Some(Duration::from_secs(1))); + } +} diff --git 
a/cache_system/Cargo.toml b/cache_system/Cargo.toml new file mode 100644 index 0000000..bb07eba --- /dev/null +++ b/cache_system/Cargo.toml @@ -0,0 +1,40 @@ +[package] +name = "cache_system" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +async-trait = "0.1.77" +backoff = { path = "../backoff" } +futures = "0.3" +iox_time = { path = "../iox_time" } +metric = { path = "../metric" } +observability_deps = { path = "../observability_deps" } +ouroboros = "0.18" +parking_lot = { version = "0.12", features = ["arc_lock"] } +pdatastructs = { version = "0.7", default-features = false, features = ["fixedbitset"] } +rand = "0.8.3" +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio-util = { version = "0.7.10" } +trace = { path = "../trace"} +tracker = { path = "../tracker"} +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +criterion = { version = "0.5", default-features = false, features = ["rayon"]} +proptest = { version = "1", default_features = false, features = ["std"] } +test_helpers = { path = "../test_helpers" } + +[lib] +# Allow --save-baseline to work +# https://github.com/bheisler/criterion.rs/issues/275 +bench = false + +[[bench]] +name = "addressable_heap" +harness = false diff --git a/cache_system/benches/addressable_heap.rs b/cache_system/benches/addressable_heap.rs new file mode 100644 index 0000000..42a9e8b --- /dev/null +++ b/cache_system/benches/addressable_heap.rs @@ -0,0 +1,420 @@ +use std::mem::size_of; + +use cache_system::addressable_heap::AddressableHeap; +use criterion::{ + criterion_group, criterion_main, measurement::WallTime, AxisScale, BatchSize, BenchmarkGroup, + BenchmarkId, Criterion, PlotConfiguration, SamplingMode, +}; +use rand::{prelude::SliceRandom, thread_rng, Rng}; + +/// Payload (`V`) for testing. 
+/// +/// This is a 64bit-wide object which is enough to store a [`Box`] or a [`usize`]. +#[derive(Debug, Clone, Default)] +struct Payload([u8; 8]); + +const _: () = assert!(size_of::() == 8); +const _: () = assert!(size_of::() >= size_of::>>()); +const _: () = assert!(size_of::() >= size_of::()); + +type TestHeap = AddressableHeap; + +const TEST_SIZES: &[usize] = &[0, 1, 10, 100, 1_000, 10_000]; + +#[derive(Debug, Clone)] +struct Entry { + k: u64, + o: u64, +} + +impl Entry { + fn new_random(rng: &mut R) -> Self + where + R: Rng, + { + Self { + // leave some room at the top and bottom + k: (rng.gen::() << 1) + (u64::MAX << 2), + // leave some room at the top and bottom + o: (rng.gen::() << 1) + (u64::MAX << 2), + } + } + + fn new_random_n(rng: &mut R, n: usize) -> Vec + where + R: Rng, + { + (0..n).map(|_| Self::new_random(rng)).collect() + } +} + +fn create_filled_heap(rng: &mut R, n: usize) -> (TestHeap, Vec) +where + R: Rng, +{ + let mut heap = TestHeap::default(); + let mut entries = Vec::with_capacity(n); + + for _ in 0..n { + let entry = Entry::new_random(rng); + heap.insert(entry.k, Payload::default(), entry.o); + entries.push(entry); + } + + (heap, entries) +} + +fn setup_group(g: &mut BenchmarkGroup<'_, WallTime>) { + g.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); + g.sampling_mode(SamplingMode::Flat); +} + +fn bench_insert_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("insert_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || (TestHeap::default(), Entry::new_random_n(&mut rng, *n)), + |(mut heap, entries)| { + for entry in &entries { + heap.insert(entry.k, Payload::default(), entry.o); + } + + // let criterion handle the drop + (heap, entries) + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +fn bench_peek_after_n_elements(c: &mut Criterion) { + let 
mut g = c.benchmark_group("peek_after_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || create_filled_heap(&mut rng, *n).0, + |heap| { + heap.peek(); + + // let criterion handle the drop + heap + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +fn bench_get_existing_after_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("get_existing_after_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + if *n == 0 { + continue; + } + + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || { + let (heap, entries) = create_filled_heap(&mut rng, *n); + let entry = entries.choose(&mut rng).unwrap().clone(); + (heap, entry) + }, + |(heap, entry)| { + heap.get(&entry.k); + + // let criterion handle the drop + heap + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +fn bench_get_new_after_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("get_new_after_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || { + let (heap, _entries) = create_filled_heap(&mut rng, *n); + let entry = Entry::new_random(&mut rng); + (heap, entry) + }, + |(heap, entry)| { + heap.get(&entry.k); + + // let criterion handle the drop + heap + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +fn bench_pop_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("pop_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || create_filled_heap(&mut rng, *n).0, + |mut heap| { + for _ in 0..*n { + heap.pop(); + } + + // let criterion handle the drop + heap + }, + 
BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +fn bench_remove_existing_after_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("remove_existing_after_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + if *n == 0 { + continue; + } + + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || { + let (heap, entries) = create_filled_heap(&mut rng, *n); + let entry = entries.choose(&mut rng).unwrap().clone(); + (heap, entry) + }, + |(mut heap, entry)| { + heap.remove(&entry.k); + + // let criterion handle the drop + heap + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +fn bench_remove_new_after_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("remove_new_after_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || { + let (heap, _entries) = create_filled_heap(&mut rng, *n); + let entry = Entry::new_random(&mut rng); + (heap, entry) + }, + |(mut heap, entry)| { + heap.remove(&entry.k); + + // let criterion handle the drop + heap + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +fn bench_replace_after_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("replace_after_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + if *n == 0 { + continue; + } + + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || { + let (heap, entries) = create_filled_heap(&mut rng, *n); + let entry = entries.choose(&mut rng).unwrap().clone(); + let entry = Entry { + k: entry.k, + o: Entry::new_random(&mut rng).o, + }; + (heap, entry) + }, + |(mut heap, entry)| { + heap.insert(entry.k, Payload::default(), entry.o); + + // let criterion handle the drop + heap + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + 
+fn bench_update_order_existing_to_random_after_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("update_order_existing_to_random_after_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + if *n == 0 { + continue; + } + + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || { + let (heap, entries) = create_filled_heap(&mut rng, *n); + let entry = entries.choose(&mut rng).unwrap().clone(); + let entry = Entry { + k: entry.k, + o: Entry::new_random(&mut rng).o, + }; + (heap, entry) + }, + |(mut heap, entry)| { + heap.update_order(&entry.k, entry.o); + + // let criterion handle the drop + heap + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +fn bench_update_order_existing_to_last_after_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("update_order_existing_to_first_after_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + if *n == 0 { + continue; + } + + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || { + let (heap, entries) = create_filled_heap(&mut rng, *n); + let entry = entries.choose(&mut rng).unwrap().clone(); + let entry = Entry { + k: entry.k, + o: u64::MAX - (u64::MAX << 2), + }; + (heap, entry) + }, + |(mut heap, entry)| { + heap.update_order(&entry.k, entry.o); + + // let criterion handle the drop + heap + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +fn bench_update_order_new_after_n_elements(c: &mut Criterion) { + let mut g = c.benchmark_group("update_order_new_after_n_elements"); + setup_group(&mut g); + + let mut rng = thread_rng(); + + for n in TEST_SIZES { + g.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, &_n| { + b.iter_batched( + || { + let (heap, _entries) = create_filled_heap(&mut rng, *n); + let entry = Entry::new_random(&mut rng); + (heap, entry) + }, + |(mut heap, entry)| { + heap.update_order(&entry.k, 
entry.o); + + // let criterion handle the drop + heap + }, + BatchSize::LargeInput, + ); + }); + } + + g.finish(); +} + +criterion_group! { + name = benches; + config = Criterion::default(); + targets = + bench_insert_n_elements, + bench_peek_after_n_elements, + bench_get_existing_after_n_elements, + bench_get_new_after_n_elements, + bench_pop_n_elements, + bench_remove_existing_after_n_elements, + bench_remove_new_after_n_elements, + bench_replace_after_n_elements, + bench_update_order_existing_to_random_after_n_elements, + bench_update_order_existing_to_last_after_n_elements, + bench_update_order_new_after_n_elements, +} +criterion_main!(benches); diff --git a/cache_system/src/addressable_heap.rs b/cache_system/src/addressable_heap.rs new file mode 100644 index 0000000..4f3466f --- /dev/null +++ b/cache_system/src/addressable_heap.rs @@ -0,0 +1,611 @@ +//! Implementation of an [`AddressableHeap`]. +use std::{ + collections::{hash_map, BTreeSet, HashMap}, + hash::Hash, +}; + +/// Addressable heap. +/// +/// Stores a value `V` together with a key `K` and an order `O`. Elements are sorted by `O` and the smallest element can +/// be peeked/popped. At the same time elements can be addressed via `K`. +/// +/// Note that `K` requires the inner data structure to implement [`Ord`] as a tie breaker. +#[derive(Debug, Clone)] +pub struct AddressableHeap +where + K: Clone + Eq + Hash + Ord, + O: Clone + Ord, +{ + /// Key to order and value. + /// + /// The order is required to lookup data within the queue. + /// + /// The value is stored here instead of the queue since HashMap entries are copied around less often than queue elements. + key_to_order_and_value: HashMap, + + /// Queue that handles the priorities. + /// + /// The order goes first, the key goes second. + /// + /// Note: This is not really a heap, but it fulfills the interface that we need. 
+ queue: BTreeSet<(O, K)>, +} + +impl AddressableHeap +where + K: Clone + Eq + Hash + Ord, + O: Clone + Ord, +{ + /// Create new, empty heap. + pub fn new() -> Self { + Self { + key_to_order_and_value: HashMap::new(), + queue: BTreeSet::new(), + } + } + + /// Check if the heap is empty. + pub fn is_empty(&self) -> bool { + let res1 = self.key_to_order_and_value.is_empty(); + let res2 = self.queue.is_empty(); + assert_eq!(res1, res2, "data structures out of sync"); + res1 + } + + /// Insert element. + /// + /// If the element (compared by `K`) already exists, it will be returned. + pub fn insert(&mut self, k: K, v: V, o: O) -> Option<(V, O)> { + let (result, k) = match self.key_to_order_and_value.entry(k.clone()) { + hash_map::Entry::Occupied(mut entry_o) => { + // `entry_o.replace_entry(...)` is not stable yet, see https://github.com/rust-lang/rust/issues/44286 + let mut tmp = (v, o.clone()); + std::mem::swap(&mut tmp, entry_o.get_mut()); + let (v_old, o_old) = tmp; + + let query = (o_old, k); + let existed = self.queue.remove(&query); + assert!(existed, "key was in key_to_order"); + let (o_old, k) = query; + + (Some((v_old, o_old)), k) + } + hash_map::Entry::Vacant(entry_v) => { + entry_v.insert((v, o.clone())); + (None, k) + } + }; + + let inserted = self.queue.insert((o, k)); + assert!(inserted, "entry should have been removed by now"); + + result + } + + /// Peek first element (by smallest `O`). + pub fn peek(&self) -> Option<(&K, &V, &O)> { + self.iter().next() + } + + /// Pop first element (by smallest `O`) from heap. + pub fn pop(&mut self) -> Option<(K, V, O)> { + if let Some((o, k)) = self.queue.pop_first() { + let (v, o2) = self + .key_to_order_and_value + .remove(&k) + .expect("value is in queue"); + assert!(o == o2); + Some((k, v, o)) + } else { + None + } + } + + /// Iterate over elements in order of `O` (starting at smallest). + /// + /// This is equivalent to calling [`pop`](Self::pop) multiple times, but does NOT modify the collection.
+ pub fn iter(&self) -> AddressableHeapIter<'_, K, V, O> { + AddressableHeapIter { + key_to_order_and_value: &self.key_to_order_and_value, + queue_iter: self.queue.iter(), + } + } + + /// Get element by key. + pub fn get(&self, k: &K) -> Option<(&V, &O)> { + self.key_to_order_and_value.get(k).map(project_tuple) + } + + /// Remove element by key. + /// + /// If the element exists within the heap (addressed via `K`), the value and order will be returned. + pub fn remove(&mut self, k: &K) -> Option<(V, O)> { + if let Some((k, (v, o))) = self.key_to_order_and_value.remove_entry(k) { + let query = (o, k); + let existed = self.queue.remove(&query); + assert!(existed, "key was in key_to_order"); + let (o, _k) = query; + Some((v, o)) + } else { + None + } + } + + /// Update order of a given key. + /// + /// Returns existing order if the key existed. + pub fn update_order(&mut self, k: &K, o: O) -> Option { + match self.key_to_order_and_value.get_mut(k) { + Some(entry) => { + let mut o_old = o.clone(); + std::mem::swap(&mut entry.1, &mut o_old); + + let query = (o_old, k.clone()); + let existed = self.queue.remove(&query); + assert!(existed, "key was in key_to_order"); + let (o_old, k) = query; + + let inserted = self.queue.insert((o, k)); + assert!(inserted, "entry should have been removed by now"); + + Some(o_old) + } + None => None, + } + } +} + +impl Default for AddressableHeap +where + K: Clone + Eq + Hash + Ord, + O: Clone + Ord, +{ + fn default() -> Self { + Self::new() + } +} + +/// Project tuple references. +fn project_tuple(t: &(A, B)) -> (&A, &B) { + (&t.0, &t.1) +} + +/// Iterator of [`AddressableHeap::iter`]. 
+#[derive(Debug)] +pub struct AddressableHeapIter<'a, K, V, O> +where + K: Clone + Eq + Hash + Ord, + O: Clone + Ord, +{ + key_to_order_and_value: &'a HashMap, + queue_iter: std::collections::btree_set::Iter<'a, (O, K)>, +} + +impl<'a, K, V, O> Iterator for AddressableHeapIter<'a, K, V, O> +where + K: Clone + Eq + Hash + Ord, + O: Clone + Ord, +{ + type Item = (&'a K, &'a V, &'a O); + + fn next(&mut self) -> Option { + self.queue_iter.next().map(|(o, k)| { + let (v, o2) = self + .key_to_order_and_value + .get(k) + .expect("value is in queue"); + assert!(o == o2); + (k, v, o) + }) + } + + fn size_hint(&self) -> (usize, Option) { + self.queue_iter.size_hint() + } +} + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + + use super::*; + + #[test] + fn test_peek_empty() { + let heap = AddressableHeap::::new(); + + assert_eq!(heap.peek(), None); + } + + #[test] + fn test_peek_some() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + heap.insert(3, "c", 5); + + assert_eq!(heap.peek(), Some((&2, &"b", &3))); + } + + #[test] + fn test_peek_tie() { + let mut heap = AddressableHeap::new(); + + heap.insert(3, "a", 1); + heap.insert(1, "b", 1); + heap.insert(2, "c", 1); + + assert_eq!(heap.peek(), Some((&1, &"b", &1))); + } + + #[test] + fn test_peek_after_remove() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + heap.insert(3, "c", 5); + + assert_eq!(heap.peek(), Some((&2, &"b", &3))); + heap.remove(&3); + assert_eq!(heap.peek(), Some((&2, &"b", &3))); + heap.remove(&2); + assert_eq!(heap.peek(), Some((&1, &"a", &4))); + heap.remove(&1); + assert_eq!(heap.peek(), None); + } + + #[test] + fn test_peek_after_override() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + heap.insert(1, "c", 2); + + assert_eq!(heap.peek(), Some((&1, &"c", &2))); + } + + #[test] + fn test_pop_empty() { + let mut heap = AddressableHeap::::new(); + + 
assert_eq!(heap.pop(), None); + } + + #[test] + fn test_pop_all() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + heap.insert(3, "c", 5); + + assert_eq!(heap.pop(), Some((2, "b", 3))); + assert_eq!(heap.pop(), Some((1, "a", 4))); + assert_eq!(heap.pop(), Some((3, "c", 5))); + assert_eq!(heap.pop(), None); + } + + #[test] + fn test_pop_tie() { + let mut heap = AddressableHeap::new(); + + heap.insert(3, "a", 1); + heap.insert(1, "b", 1); + heap.insert(2, "c", 1); + + assert_eq!(heap.pop(), Some((1, "b", 1))); + assert_eq!(heap.pop(), Some((2, "c", 1))); + assert_eq!(heap.pop(), Some((3, "a", 1))); + assert_eq!(heap.pop(), None); + } + + #[test] + fn test_pop_after_insert() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + heap.insert(3, "c", 5); + + assert_eq!(heap.pop(), Some((2, "b", 3))); + + heap.insert(4, "d", 2); + assert_eq!(heap.pop(), Some((4, "d", 2))); + assert_eq!(heap.pop(), Some((1, "a", 4))); + } + + #[test] + fn test_pop_after_remove() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + heap.insert(3, "c", 5); + + heap.remove(&2); + assert_eq!(heap.pop(), Some((1, "a", 4))); + } + + #[test] + fn test_pop_after_override() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + heap.insert(1, "c", 2); + + assert_eq!(heap.pop(), Some((1, "c", 2))); + assert_eq!(heap.pop(), Some((2, "b", 3))); + assert_eq!(heap.pop(), None); + } + + #[test] + fn test_get_empty() { + let heap = AddressableHeap::::new(); + + assert_eq!(heap.get(&1), None); + } + + #[test] + fn test_get_multiple() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + + assert_eq!(heap.get(&1), Some((&"a", &4))); + assert_eq!(heap.get(&2), Some((&"b", &3))); + } + + #[test] + fn test_get_after_remove() { + let mut heap = AddressableHeap::new(); + + 
heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + + heap.remove(&1); + + assert_eq!(heap.get(&1), None); + assert_eq!(heap.get(&2), Some((&"b", &3))); + } + + #[test] + fn test_get_after_pop() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + + heap.pop(); + + assert_eq!(heap.get(&1), Some((&"a", &4))); + assert_eq!(heap.get(&2), None); + } + + #[test] + fn test_get_after_override() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(1, "b", 3); + + assert_eq!(heap.get(&1), Some((&"b", &3))); + } + + #[test] + fn test_remove_empty() { + let mut heap = AddressableHeap::::new(); + + assert_eq!(heap.remove(&1), None); + } + + #[test] + fn test_remove_some() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + + assert_eq!(heap.remove(&1), Some(("a", 4))); + assert_eq!(heap.remove(&2), Some(("b", 3))); + } + + #[test] + fn test_remove_twice() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + + assert_eq!(heap.remove(&1), Some(("a", 4))); + assert_eq!(heap.remove(&1), None); + } + + #[test] + fn test_remove_after_pop() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(2, "b", 3); + + heap.pop(); + + assert_eq!(heap.remove(&1), Some(("a", 4))); + assert_eq!(heap.remove(&2), None); + } + + #[test] + fn test_remove_after_override() { + let mut heap = AddressableHeap::new(); + + heap.insert(1, "a", 4); + heap.insert(1, "b", 3); + + assert_eq!(heap.remove(&1), Some(("b", 3))); + assert_eq!(heap.remove(&1), None); + } + + #[test] + fn test_override() { + let mut heap = AddressableHeap::new(); + + assert_eq!(heap.insert(1, "a", 4), None); + assert_eq!(heap.insert(2, "b", 3), None); + assert_eq!(heap.insert(1, "c", 5), Some(("a", 4))); + } + + /// Simple version of [`AddressableHeap`] for testing. 
+ struct SimpleAddressableHeap { + inner: Vec<(u8, String, i8)>, + } + + impl SimpleAddressableHeap { + fn new() -> Self { + Self { inner: Vec::new() } + } + + fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + fn insert(&mut self, k: u8, v: String, o: i8) -> Option<(String, i8)> { + let res = self.remove(&k); + self.inner.push((k, v, o)); + + res + } + + #[allow(clippy::map_identity)] // https://github.com/rust-lang/rust-clippy/issues/11764 + fn peek(&self) -> Option<(&u8, &String, &i8)> { + self.inner + .iter() + .min_by_key(|(k, _v, o)| (o, k)) + .map(|(k, v, o)| (k, v, o)) + } + + fn dump_ordered(&self) -> Vec<(u8, String, i8)> { + let mut inner = self.inner.clone(); + inner.sort_by_key(|(k, _v, o)| (*o, *k)); + inner + } + + fn pop(&mut self) -> Option<(u8, String, i8)> { + self.inner + .iter() + .enumerate() + .min_by_key(|(_idx, (k, _v, o))| (o, k)) + .map(|(idx, _)| idx) + .map(|idx| self.inner.remove(idx)) + } + + fn get(&self, k: &u8) -> Option<(&String, &i8)> { + self.inner + .iter() + .find(|(k2, _v, _o)| k2 == k) + .map(|(_k, v, o)| (v, o)) + } + + fn remove(&mut self, k: &u8) -> Option<(String, i8)> { + self.inner + .iter() + .enumerate() + .find(|(_idx, (k2, _v, _o))| k2 == k) + .map(|(idx, _)| idx) + .map(|idx| { + let (_k, v, o) = self.inner.remove(idx); + (v, o) + }) + } + + fn update_order(&mut self, k: &u8, o: i8) -> Option { + if let Some((v, o_old)) = self.remove(k) { + self.insert(*k, v, o); + Some(o_old) + } else { + None + } + } + } + + #[derive(Debug, Clone)] + enum Action { + IsEmpty, + Insert { k: u8, v: String, o: i8 }, + Peek, + Iter, + Pop, + Get { k: u8 }, + Remove { k: u8 }, + UpdateOrder { k: u8, o: i8 }, + } + + // Use a hand-rolled strategy instead of `proptest-derive`, because the latter one is quite a heavy dependency. 
+ fn action() -> impl Strategy { + prop_oneof![ + Just(Action::IsEmpty), + (any::(), ".*", any::()).prop_map(|(k, v, o)| Action::Insert { k, v, o }), + Just(Action::Peek), + Just(Action::Iter), + Just(Action::Pop), + any::().prop_map(|k| Action::Get { k }), + any::().prop_map(|k| Action::Remove { k }), + (any::(), any::()).prop_map(|(k, o)| Action::UpdateOrder { k, o }), + ] + } + + proptest! { + #[test] + fn test_proptest(actions in prop::collection::vec(action(), 0..100)) { + let mut heap = AddressableHeap::new(); + let mut sim = SimpleAddressableHeap::new(); + + for action in actions { + match action { + Action::IsEmpty => { + let res1 = heap.is_empty(); + let res2 = sim.is_empty(); + assert_eq!(res1, res2); + } + Action::Insert{k, v, o} => { + let res1 = heap.insert(k, v.clone(), o); + let res2 = sim.insert(k, v, o); + assert_eq!(res1, res2); + } + Action::Peek => { + let res1 = heap.peek(); + let res2 = sim.peek(); + assert_eq!(res1, res2); + } + Action::Iter => { + let res1 = heap.iter().map(|(k, v, o)| (*k, v.clone(), *o)).collect::>(); + let res2 = sim.dump_ordered(); + assert_eq!(res1, res2); + } + Action::Pop => { + let res1 = heap.pop(); + let res2 = sim.pop(); + assert_eq!(res1, res2); + } + Action::Get{k} => { + let res1 = heap.get(&k); + let res2 = sim.get(&k); + assert_eq!(res1, res2); + } + Action::Remove{k} => { + let res1 = heap.remove(&k); + let res2 = sim.remove(&k); + assert_eq!(res1, res2); + } + Action::UpdateOrder{k, o} => { + let res1 = heap.update_order(&k, o); + let res2 = sim.update_order(&k, o); + assert_eq!(res1, res2); + } + } + } + } + } +} diff --git a/cache_system/src/backend/hash_map.rs b/cache_system/src/backend/hash_map.rs new file mode 100644 index 0000000..cb3c302 --- /dev/null +++ b/cache_system/src/backend/hash_map.rs @@ -0,0 +1,51 @@ +//! Implements [`CacheBackend`] for [`HashMap`]. 
+use std::{ + any::Any, + collections::HashMap, + fmt::Debug, + hash::{BuildHasher, Hash}, +}; + +use super::CacheBackend; + +impl CacheBackend for HashMap +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, + S: BuildHasher + Send + 'static, +{ + type K = K; + type V = V; + + fn get(&mut self, k: &Self::K) -> Option { + Self::get(self, k).cloned() + } + + fn set(&mut self, k: Self::K, v: Self::V) { + self.insert(k, v); + } + + fn remove(&mut self, k: &Self::K) { + self.remove(k); + } + + fn is_empty(&self) -> bool { + self.is_empty() + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_generic() { + use crate::backend::test_util::test_generic; + + test_generic(HashMap::new); + } +} diff --git a/cache_system/src/backend/mod.rs b/cache_system/src/backend/mod.rs new file mode 100644 index 0000000..8395c83 --- /dev/null +++ b/cache_system/src/backend/mod.rs @@ -0,0 +1,66 @@ +//! Storage backends to keep and manage cached entries. +use std::{any::Any, fmt::Debug, hash::Hash}; + +pub mod hash_map; +pub mod policy; + +#[cfg(test)] +mod test_util; + +/// Backend to keep and manage stored entries. +/// +/// A backend might remove entries at any point, e.g. due to memory pressure or expiration. +pub trait CacheBackend: Debug { + /// Cache key. + type K: Clone + Eq + Hash + Ord + Debug + Send + 'static; + + /// Cached value. + type V: Clone + Debug + Send + 'static; + + /// Get value for given key if it exists. + fn get(&mut self, k: &Self::K) -> Option; + + /// Set value for given key. + /// + /// It is OK to set and override a key that already exists. + fn set(&mut self, k: Self::K, v: Self::V); + + /// Remove value for given key. + /// + /// It is OK to remove a key even when it does not exist. + fn remove(&mut self, k: &Self::K); + + /// Check if backend is empty. 
+ fn is_empty(&self) -> bool; + + /// Return backend as [`Any`] which can be used to downcast to a specific implementation. + fn as_any(&self) -> &dyn Any; +} + +impl CacheBackend for Box +where + T: CacheBackend + ?Sized + 'static, +{ + type K = T::K; + type V = T::V; + + fn get(&mut self, k: &Self::K) -> Option { + self.as_mut().get(k) + } + + fn set(&mut self, k: Self::K, v: Self::V) { + self.as_mut().set(k, v) + } + + fn remove(&mut self, k: &Self::K) { + self.as_mut().remove(k) + } + + fn is_empty(&self) -> bool { + self.as_ref().is_empty() + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } +} diff --git a/cache_system/src/backend/policy/integration_tests.rs b/cache_system/src/backend/policy/integration_tests.rs new file mode 100644 index 0000000..c99a2d0 --- /dev/null +++ b/cache_system/src/backend/policy/integration_tests.rs @@ -0,0 +1,599 @@ +//! Test integration between different policies. + +use std::{collections::HashMap, sync::Arc, time::Duration}; + +use iox_time::{MockProvider, Time}; +use parking_lot::Mutex; +use rand::rngs::mock::StepRng; +use test_helpers::maybe_start_logging; +use tokio::{runtime::Handle, sync::Notify}; + +use crate::{ + backend::{ + policy::refresh::test_util::{backoff_cfg, NotifyExt}, + CacheBackend, + }, + loader::test_util::TestLoader, + resource_consumption::{test_util::TestSize, ResourceEstimator}, +}; + +use super::{ + lru::{LruPolicy, ResourcePool}, + refresh::{test_util::TestRefreshDurationProvider, RefreshPolicy}, + remove_if::{RemoveIfHandle, RemoveIfPolicy}, + ttl::{test_util::TestTtlProvider, TtlPolicy}, + PolicyBackend, +}; + +#[tokio::test] +async fn test_refresh_can_prevent_expiration() { + let TestStateTtlAndRefresh { + mut backend, + refresh_duration_provider, + ttl_provider, + time_provider, + loader, + notify_idle, + .. 
+ } = TestStateTtlAndRefresh::new(); + + loader.mock_next(1, String::from("foo")); + + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(2))); + + refresh_duration_provider.set_refresh_in(1, String::from("foo"), None); + ttl_provider.set_expires_in(1, String::from("foo"), Some(Duration::from_secs(2))); + + backend.set(1, String::from("a")); + + // perform refresh + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + + // no expired because refresh resets the timer + time_provider.inc(Duration::from_secs(1)); + assert_eq!(backend.get(&1), Some(String::from("foo"))); + + // we don't request a 2nd refresh (refresh duration is None), so this finally expires + time_provider.inc(Duration::from_secs(1)); + assert_eq!(backend.get(&1), None); +} + +#[tokio::test] +async fn test_refresh_sets_new_expiration_after_it_finishes() { + let TestStateTtlAndRefresh { + mut backend, + refresh_duration_provider, + ttl_provider, + time_provider, + loader, + notify_idle, + .. 
+ } = TestStateTtlAndRefresh::new(); + + let barrier = loader.block_next(1, String::from("foo")); + + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(3))); + + refresh_duration_provider.set_refresh_in(1, String::from("foo"), None); + ttl_provider.set_expires_in(1, String::from("foo"), Some(Duration::from_secs(3))); + + backend.set(1, String::from("a")); + + // perform refresh + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + + time_provider.inc(Duration::from_secs(1)); + barrier.wait().await; + notify_idle.notified_with_timeout().await; + assert_eq!(backend.get(&1), Some(String::from("foo"))); + + // no expired because refresh resets the timer after it was ready (now), not when it started (1s ago) + time_provider.inc(Duration::from_secs(2)); + assert_eq!(backend.get(&1), Some(String::from("foo"))); + + // we don't request a 2nd refresh (refresh duration is None), so this finally expires + time_provider.inc(Duration::from_secs(1)); + assert_eq!(backend.get(&1), None); +} + +#[tokio::test] +async fn test_refresh_does_not_update_lru_time() { + let TestStateLruAndRefresh { + mut backend, + refresh_duration_provider, + size_estimator, + time_provider, + loader, + notify_idle, + pool, + .. 
+ } = TestStateLruAndRefresh::new(); + + size_estimator.mock_size(1, String::from("a"), TestSize(4)); + size_estimator.mock_size(1, String::from("foo"), TestSize(4)); + size_estimator.mock_size(2, String::from("b"), TestSize(4)); + size_estimator.mock_size(3, String::from("c"), TestSize(4)); + + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + refresh_duration_provider.set_refresh_in(1, String::from("foo"), None); + refresh_duration_provider.set_refresh_in(2, String::from("b"), None); + refresh_duration_provider.set_refresh_in(3, String::from("c"), None); + + let barrier = loader.block_next(1, String::from("foo")); + backend.set(1, String::from("a")); + pool.wait_converged().await; + + // trigger refresh + time_provider.inc(Duration::from_secs(1)); + + time_provider.inc(Duration::from_secs(1)); + backend.set(2, String::from("b")); + pool.wait_converged().await; + + time_provider.inc(Duration::from_secs(1)); + + notify_idle.notified_with_timeout().await; + barrier.wait().await; + notify_idle.notified_with_timeout().await; + + // add a third item to the cache, forcing LRU to evict one of the items + backend.set(3, String::from("c")); + pool.wait_converged().await; + + // Should evict `1` even though it was refreshed after `2` was added + assert_eq!(backend.get(&1), None); +} + +#[tokio::test] +async fn test_if_refresh_to_slow_then_expire() { + let TestStateTtlAndRefresh { + mut backend, + refresh_duration_provider, + ttl_provider, + time_provider, + loader, + notify_idle, + .. 
+ } = TestStateTtlAndRefresh::new(); + + let barrier = loader.block_next(1, String::from("foo")); + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(2))); + backend.set(1, String::from("a")); + + // perform refresh + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + + time_provider.inc(Duration::from_secs(1)); + notify_idle.not_notified().await; + assert_eq!(backend.get(&1), None); + + // late loader finish will NOT bring the entry back + barrier.wait().await; + notify_idle.notified_with_timeout().await; + assert_eq!(backend.get(&1), None); +} + +#[tokio::test] +async fn test_refresh_can_trigger_lru_eviction() { + maybe_start_logging(); + + let TestStateLRUAndRefresh { + mut backend, + refresh_duration_provider, + loader, + size_estimator, + time_provider, + notify_idle, + pool, + .. + } = TestStateLRUAndRefresh::new(); + + assert_eq!(pool.limit(), TestSize(10)); + + loader.mock_next(1, String::from("b")); + + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + refresh_duration_provider.set_refresh_in(1, String::from("b"), None); + refresh_duration_provider.set_refresh_in(2, String::from("c"), None); + refresh_duration_provider.set_refresh_in(3, String::from("d"), None); + + size_estimator.mock_size(1, String::from("a"), TestSize(1)); + size_estimator.mock_size(1, String::from("b"), TestSize(9)); + size_estimator.mock_size(2, String::from("c"), TestSize(1)); + size_estimator.mock_size(3, String::from("d"), TestSize(1)); + + backend.set(1, String::from("a")); + backend.set(2, String::from("c")); + backend.set(3, String::from("d")); + pool.wait_converged().await; + assert_eq!(backend.get(&2), Some(String::from("c"))); + assert_eq!(backend.get(&3), Some(String::from("d"))); + 
time_provider.inc(Duration::from_millis(1)); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + // refresh + time_provider.inc(Duration::from_secs(10)); + notify_idle.notified_with_timeout().await; + pool.wait_converged().await; + + // needed to evict 2->"c" + assert_eq!(backend.get(&1), Some(String::from("b"))); + assert_eq!(backend.get(&2), None); + assert_eq!(backend.get(&3), Some(String::from("d"))); +} + +#[tokio::test] +async fn test_lru_learns_about_ttl_evictions() { + let TestStateTtlAndLRU { + mut backend, + ttl_provider, + size_estimator, + time_provider, + pool, + .. + } = TestStateTtlAndLRU::new().await; + + assert_eq!(pool.limit(), TestSize(10)); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + ttl_provider.set_expires_in(2, String::from("b"), None); + ttl_provider.set_expires_in(3, String::from("c"), None); + + size_estimator.mock_size(1, String::from("a"), TestSize(4)); + size_estimator.mock_size(2, String::from("b"), TestSize(4)); + size_estimator.mock_size(3, String::from("c"), TestSize(4)); + + backend.set(1, String::from("a")); + backend.set(2, String::from("b")); + + assert_eq!(pool.current(), TestSize(8)); + + // evict + time_provider.inc(Duration::from_secs(1)); + assert_eq!(backend.get(&1), None); + + // now there's space for 3->"c" + assert_eq!(pool.current(), TestSize(4)); + backend.set(3, String::from("c")); + + assert_eq!(pool.current(), TestSize(8)); + assert_eq!(backend.get(&1), None); + assert_eq!(backend.get(&2), Some(String::from("b"))); + assert_eq!(backend.get(&3), Some(String::from("c"))); +} + +#[tokio::test] +async fn test_remove_if_check_does_not_extend_lifetime() { + let TestStateLruAndRemoveIf { + mut backend, + size_estimator, + time_provider, + remove_if_handle, + pool, + .. 
+ } = TestStateLruAndRemoveIf::new().await; + + size_estimator.mock_size(1, String::from("a"), TestSize(4)); + size_estimator.mock_size(2, String::from("b"), TestSize(4)); + size_estimator.mock_size(3, String::from("c"), TestSize(4)); + + backend.set(1, String::from("a")); + pool.wait_converged().await; + time_provider.inc(Duration::from_secs(1)); + + backend.set(2, String::from("b")); + pool.wait_converged().await; + time_provider.inc(Duration::from_secs(1)); + + // Checking remove_if should not count as a "use" of 1 + // for the "least recently used" calculation + remove_if_handle.remove_if(&1, |_| false); + backend.set(3, String::from("c")); + pool.wait_converged().await; + + // adding "c" totals 12 size, but backend has room for only 10 + // so "least recently used" (in this case 1, not 2) should be removed + assert_eq!(backend.get(&1), None); + assert!(backend.get(&2).is_some()); +} + +/// Test setup that integrates the TTL policy with a refresh. +struct TestStateTtlAndRefresh { + backend: PolicyBackend, + ttl_provider: Arc, + refresh_duration_provider: Arc, + time_provider: Arc, + loader: Arc>, + notify_idle: Arc, +} + +impl TestStateTtlAndRefresh { + fn new() -> Self { + let refresh_duration_provider = Arc::new(TestRefreshDurationProvider::new()); + let ttl_provider = Arc::new(TestTtlProvider::new()); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let metric_registry = metric::Registry::new(); + let loader = Arc::new(TestLoader::default()); + let notify_idle = Arc::new(Notify::new()); + + // set up "RNG" that always generates the maximum, so we can test things easier + let rng_overwrite = StepRng::new(u64::MAX, 0); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(RefreshPolicy::new_inner( + Arc::clone(&time_provider) as _, + Arc::clone(&refresh_duration_provider) as _, + Arc::clone(&loader) as _, + "my_cache", + &metric_registry, + Arc::clone(¬ify_idle), + &Handle::current(), + 
Some(rng_overwrite), + )); + backend.add_policy(TtlPolicy::new( + Arc::clone(&ttl_provider) as _, + "my_cache", + &metric_registry, + )); + + Self { + backend, + refresh_duration_provider, + ttl_provider, + time_provider, + loader, + notify_idle, + } + } +} + +/// Test setup that integrates the LRU policy with a refresh. +struct TestStateLRUAndRefresh { + backend: PolicyBackend, + size_estimator: Arc, + refresh_duration_provider: Arc, + time_provider: Arc, + loader: Arc>, + pool: Arc>, + notify_idle: Arc, +} + +impl TestStateLRUAndRefresh { + fn new() -> Self { + let refresh_duration_provider = Arc::new(TestRefreshDurationProvider::new()); + let size_estimator = Arc::new(TestSizeEstimator::default()); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let metric_registry = Arc::new(metric::Registry::new()); + let loader = Arc::new(TestLoader::default()); + let notify_idle = Arc::new(Notify::new()); + + // set up "RNG" that always generates the maximum, so we can test things easier + let rng_overwrite = StepRng::new(u64::MAX, 0); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(RefreshPolicy::new_inner( + Arc::clone(&time_provider) as _, + Arc::clone(&refresh_duration_provider) as _, + Arc::clone(&loader) as _, + "my_cache", + &metric_registry, + Arc::clone(¬ify_idle), + &Handle::current(), + Some(rng_overwrite), + )); + let pool = Arc::new(ResourcePool::new( + "my_pool", + TestSize(10), + Arc::clone(&metric_registry), + &Handle::current(), + )); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "my_cache", + Arc::clone(&size_estimator) as _, + )); + + Self { + backend, + refresh_duration_provider, + size_estimator, + time_provider, + loader, + pool, + notify_idle, + } + } +} + +/// Test setup that integrates the TTL policy with LRU. 
+struct TestStateTtlAndLRU { + backend: PolicyBackend, + ttl_provider: Arc, + time_provider: Arc, + size_estimator: Arc, + pool: Arc>, +} + +impl TestStateTtlAndLRU { + async fn new() -> Self { + let ttl_provider = Arc::new(TestTtlProvider::new()); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let metric_registry = Arc::new(metric::Registry::new()); + let size_estimator = Arc::new(TestSizeEstimator::default()); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(TtlPolicy::new( + Arc::clone(&ttl_provider) as _, + "my_cache", + &metric_registry, + )); + let pool = Arc::new(ResourcePool::new( + "my_pool", + TestSize(10), + Arc::clone(&metric_registry), + &Handle::current(), + )); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "my_cache", + Arc::clone(&size_estimator) as _, + )); + + Self { + backend, + ttl_provider, + time_provider, + size_estimator, + pool, + } + } +} + +/// Test setup that integrates the LRU policy with RemoveIf and max size of 10 +struct TestStateLruAndRemoveIf { + backend: PolicyBackend, + time_provider: Arc, + size_estimator: Arc, + remove_if_handle: RemoveIfHandle, + pool: Arc>, +} + +impl TestStateLruAndRemoveIf { + async fn new() -> Self { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let metric_registry = Arc::new(metric::Registry::new()); + let size_estimator = Arc::new(TestSizeEstimator::default()); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + + let pool = Arc::new(ResourcePool::new( + "my_pool", + TestSize(10), + Arc::clone(&metric_registry), + &Handle::current(), + )); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "my_cache", + Arc::clone(&size_estimator) as _, + )); + + let (constructor, remove_if_handle) = + RemoveIfPolicy::create_constructor_and_handle("my_cache", &metric_registry); + backend.add_policy(constructor); + + Self { + backend, + time_provider, + size_estimator, + 
remove_if_handle, + pool, + } + } +} + +/// Test setup that integrates the LRU policy with a refresh. +struct TestStateLruAndRefresh { + backend: PolicyBackend, + size_estimator: Arc, + refresh_duration_provider: Arc, + time_provider: Arc, + loader: Arc>, + notify_idle: Arc, + pool: Arc>, +} + +impl TestStateLruAndRefresh { + fn new() -> Self { + let refresh_duration_provider = Arc::new(TestRefreshDurationProvider::new()); + let size_estimator = Arc::new(TestSizeEstimator::default()); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let metric_registry = Arc::new(metric::Registry::new()); + let loader = Arc::new(TestLoader::default()); + let notify_idle = Arc::new(Notify::new()); + + // set up "RNG" that always generates the maximum, so we can test things easier + let rng_overwrite = StepRng::new(u64::MAX, 0); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(RefreshPolicy::new_inner( + Arc::clone(&time_provider) as _, + Arc::clone(&refresh_duration_provider) as _, + Arc::clone(&loader) as _, + "my_cache", + &metric_registry, + Arc::clone(¬ify_idle), + &Handle::current(), + Some(rng_overwrite), + )); + + let pool = Arc::new(ResourcePool::new( + "my_pool", + TestSize(10), + Arc::clone(&metric_registry), + &Handle::current(), + )); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "my_cache", + Arc::clone(&size_estimator) as _, + )); + + Self { + backend, + refresh_duration_provider, + size_estimator, + time_provider, + loader, + notify_idle, + pool, + } + } +} + +#[derive(Debug, Default)] +struct TestSizeEstimator { + sizes: Mutex>, +} + +impl TestSizeEstimator { + fn mock_size(&self, k: u8, v: String, s: TestSize) { + self.sizes.lock().insert((k, v), s); + } +} + +impl ResourceEstimator for TestSizeEstimator { + type K = u8; + type V = String; + type S = TestSize; + + fn consumption(&self, k: &Self::K, v: &Self::V) -> Self::S { + *self.sizes.lock().get(&(*k, v.clone())).unwrap() + } 
+} diff --git a/cache_system/src/backend/policy/lru.rs b/cache_system/src/backend/policy/lru.rs new file mode 100644 index 0000000..4f5c9ab --- /dev/null +++ b/cache_system/src/backend/policy/lru.rs @@ -0,0 +1,2055 @@ +//! LRU (Least Recently Used) cache system. +//! +//! # Usage +//! +//! ``` +//! # tokio::runtime::Runtime::new().unwrap().block_on(async { +//! use std::{ +//! collections::HashMap, +//! ops::{Add, Sub}, +//! sync::Arc, +//! }; +//! use iox_time::SystemProvider; +//! use cache_system::{ +//! backend::{ +//! CacheBackend, +//! policy::{ +//! lru::{LruPolicy, ResourcePool}, +//! PolicyBackend, +//! }, +//! }, +//! resource_consumption::{Resource, ResourceEstimator}, +//! }; +//! use tokio::runtime::Handle; +//! +//! // first we implement a strongly-typed RAM size measurement +//! #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +//! struct RamSize(usize); +//! +//! impl Resource for RamSize { +//! fn zero() -> Self { +//! Self(0) +//! } +//! +//! fn unit() -> &'static str { +//! "bytes" +//! } +//! } +//! +//! impl From for u64 { +//! fn from(s: RamSize) -> Self { +//! s.0 as Self +//! } +//! } +//! +//! impl Add for RamSize { +//! type Output = Self; +//! +//! fn add(self, rhs: Self) -> Self::Output { +//! Self(self.0.checked_add(rhs.0).expect("overflow")) +//! } +//! } +//! +//! impl Sub for RamSize { +//! type Output = Self; +//! +//! fn sub(self, rhs: Self) -> Self::Output { +//! Self(self.0.checked_sub(rhs.0).expect("underflow")) +//! } +//! } +//! +//! // a time provider is required to determine the age of entries +//! let time_provider = Arc::new(SystemProvider::new()); +//! +//! // registry to capture metrics emitted by the LRU cache +//! let metric_registry = Arc::new(metric::Registry::new()); +//! +//! // set up a memory pool +//! let limit = RamSize(50); +//! let pool = Arc::new(ResourcePool::new( +//! "my_pool", +//! limit, +//! metric_registry, +//! &Handle::current(), +//! )); +//! +//! 
// set up first pool user: a u64->String map +//! #[derive(Debug)] +//! struct Estimator1 {} +//! +//! impl ResourceEstimator for Estimator1 { +//! type K = u64; +//! type V = String; +//! type S = RamSize; +//! +//! fn consumption(&self, _k: &Self::K, v: &Self::V) -> Self::S { +//! RamSize(8) + RamSize(v.capacity()) +//! } +//! } +//! +//! let mut backend1 = PolicyBackend::new( +//! Box::new(HashMap::new()), +//! Arc::clone(&time_provider) as _, +//! ); +//! backend1.add_policy( +//! LruPolicy::new( +//! Arc::clone(&pool), +//! "id1", +//! Arc::new(Estimator1{}), +//! ) +//! ); +//! +//! // add some data +//! backend1.set(1, String::from("some_entry")); +//! backend1.set(2, String::from("another_entry")); +//! assert_eq!(pool.current(), RamSize(39)); +//! +//! // only test first one +//! assert!(backend1.get(&1).is_some()); +//! +//! // fill up pool +//! backend1.set(3, String::from("this_will_evict_data")); +//! +//! // the policy will eventually evict the data, in tests we can use a help +//! // method to wait for that +//! pool.wait_converged().await; +//! +//! assert!(backend1.get(&1).is_some()); +//! assert!(backend1.get(&2).is_none()); +//! assert!(backend1.get(&3).is_some()); +//! assert_eq!(pool.current(), RamSize(46)); +//! +//! // set up second pool user with totally different types: a u8->Vec map +//! #[derive(Debug)] +//! struct Estimator2 {} +//! +//! impl ResourceEstimator for Estimator2 { +//! type K = u8; +//! type V = Vec; +//! type S = RamSize; +//! +//! fn consumption(&self, _k: &Self::K, v: &Self::V) -> Self::S { +//! RamSize(1) + RamSize(v.capacity()) +//! } +//! } +//! +//! let mut backend2 = PolicyBackend::new( +//! Box::new(HashMap::new()), +//! time_provider, +//! ); +//! backend2.add_policy( +//! LruPolicy::new( +//! Arc::clone(&pool), +//! "id2", +//! Arc::new(Estimator2{}), +//! ) +//! ); +//! +//! // eviction works for all pool members +//! backend2.set(1, vec![1, 2, 3, 4]); +//! pool.wait_converged().await; +//! 
assert!(backend1.get(&1).is_none()); +//! assert!(backend1.get(&2).is_none()); +//! assert!(backend1.get(&3).is_some()); +//! assert!(backend2.get(&1).is_some()); +//! assert_eq!(pool.current(), RamSize(33)); +//! # }); +//! ``` +//! +//! # Internals +//! Here we describe the internals of the LRU cache system. +//! +//! ## Requirements +//! To understand the construction, we first must understand what the LRU system tries to achieve: +//! +//! - **Single Pool:** Have a single resource pool for multiple LRU backends. +//! - **Eviction Cascade:** Adding data to any of the backends (or modifying an existing entry) should check if there is +//! enough space left in the LRU backend. If not, we must EVENTUALLY remove the least recently used entries over all +//! backends (including the one that just got a new entry) until there is enough space. +//! +//! This has the following consequences: +//! +//! - **Cyclic Structure:** The LRU backends communicate with the pool, but the pool also needs to communicate with +//! all the backends. This creates some form of cyclic data structure. +//! - **Type Erasure:** The pool is only specific to the resource type, not the key and value types of the +//! participating backends. So at some place we need to perform type erasure. +//! +//! ## Data Structures +//! +//! ```text +//! .~~~~~~~~~~~~~~~~. +//! +---------------------------------------: CallbackHandle : +//! | : : +//! | .~~~~~~~~~~~~~~~~. +//! | ^ +//! | .~~~~~~~~~~~~~~~~~. | +//! | : AddressableHeap : | +//! | : : (mutex) +//! | .~~~~~~~~~~~~~~~~~. | +//! | ^ | +//! | | | +//! V (mutex) | +//! .~~~~~~~~~~~~~~~. .~~~~~~~~~~~. | .~~~~~~~~~~~~~~~~. .~~~~~~~~~~~~. +//! -->: PolicyBackend :--->: LruPolicy : | : PoolMemberImpl : : PoolMember : +//! : : : : | : : : : +//! : : : : +------: :<--(dyn)---: : +//! .~~~~~~~~~~~~~~~. .~~~~~~~~~~~. .~~~~~~~~~~~~~~~~. .~~~~~~~~~~~~. +//! | | ^ ^ +//! | | | | +//! | +--------------------------------------(arc)-----+ | +//! (arc) | +//! 
| (weak) +//! V | +//! .~~~~~~~~~~~~~~. .~~~~~~~~~~~~~. +//! ---------------------->: ResourcePool :-----+-------(arc)--------------------->: SharedState : +//! : : | : : +//! .~~~~~~~~~~~~~~. | .~~~~~~~~~~~~~. +//! | | +//! (handle) | +//! | | +//! V | +//! .~~~~~~~~~~~~~~~. | +//! : clean_up_loop :----+ +//! : : +//! .~~~~~~~~~~~~~~~. +//! ``` +//! +//! ## State +//! State is held in the following structures: +//! +//! - `LruPolicyInner`: Holds [`CallbackHandle`] as well as an [`AddressableHeap`] to +//! memorize when entries were used for the last time. +//! - `ResourcePoolInner`: Holds a reference to all pool members as well as the current consumption. +//! +//! All other structures and traits "only" act as glue. +//! +//! ## Locking +//! What and how we lock depends on the operation. +//! +//! Note that all locks are bare mutexes, there are no read-write-locks. "Only read" is not really an important use +//! case since even `get` requires updating the "last used" timestamp of the corresponding entry. +//! +//! ### Get +//! For [`GET`] we only need to update the "last used" timestamp for the affected entry. No +//! pool-wide operations are required. We update [`AddressableHeap`] and then perform the read operation of the inner +//! backend. +//! +//! ### Remove +//! For [`REMOVE`] the pool usage can only decrease, so other backends are never affected. We +//! first lock [`AddressableHeap`] and check if the entry is present. If it is, we also the "current" counter in +//! [`SharedState`] and then perform the modification on both. +//! +//! ### Set +//! [`SET`] locks [`AddressableHeap`] to figure out if th item exists. If it does, it locks the "current" counter in +//! [`SharedState`] and removes the old value. Then it updates [`AddressableHeap`] with the new value and locks&updates +//! the "current" counter in [`SharedState`] again. It then notifies the clean-up loop that there was an up. +//! +//! 
Note that in case of an override, the existing "last used" time will be used instead of "now", because just +//! replacing an existing value (e.g. via a [refresh]) should not count as a use. +//! +//! ### Clean-up Loop +//! This is the beefy bit. First it locks and reads the "current" counter in [`SharedState`]. It instantly unlocks the +//! value to not block all pool members adding new values while it we figure out what to evict. Then it selects victims +//! one by one by asking the individual pool members what they could remove. This shortly locks their +//! [`AddressableHeap`]s (one member at the time). After enough victims where selected for eviction, it will delete in +//! them one pool member at the time. Each pool member will lock their [`CallbackHandle`] and when the deletion happens +//! also their [`AddressableHeap`] and the "current" counter in [`SharedState`]. However the lock order is identical to +//! a normal "remove" operation. +//! +//! Note that the clean up loop does not directly update the "current" counter in [`SharedState`] since the "remove" +//! routine already does that. +//! +//! ## Consistency +//! This system is eventually consistent and we are a bit loose at a few places to make it more efficient and easier to +//! implement. This subsection explains cases where this could be visible to an observer. +//! +//! ### Overcommit +//! Since we add new data to the cache pool and the clean-up loop will eventually evict data, we overcommit the pool for +//! a short time. In practice however we already allocated the memory before adding it to the pool. +//! +//! There is a another risk that the cached users will add data so fast that the clean-up loop cannot keep up. This +//! however is highly unlikely, since the loop selects enough victims to get the resource usage below the limit and +//! deletes these victims in batches. The more it runs behind, the large the batch will be. +//! +//! ### Overdelete +//! 
Similar to "overcommit", it is possible that the clean-up loop deletes more items than necessary. This can happen +//! when between victim selection and actual deletion, entries are removed from the cache (e.g. via [TTL]). However the +//! timing for that is very tight and we would have deleted the data anyways if the delete would have happened a tiny +//! bit later, so in reality this is not a concern. On the other hand, the effect might also be a cache miss that was +//! not strictly necessary and in turn worse performance than we could have had. +//! +//! ### Victim-Use-Delete +//! It is possible that a key is used between victim selection and its removal. In theory we should not remove the key +//! in this case because its no longer "least recently used". However if the key usage would have occurred only a bit +//! later, we would have removed the key anyways so this tight race has no practical meaning. No user can rely on such +//! tight timings and the fullness of a cache pool. +//! +//! ### Victim-Downsize-Delete +//! A selected victim might be replaced with a smaller one between victim selection and its deletion. In this case, the +//! clean-up loop does not delete enough data in its current try but needs an additional iteration. In reality this is +//! very unlikely since most cached entries rarely shrink and even if they do, the clean-up loop will eventually catch +//! up again. +//! +//! +//! [`GET`]: Subscriber::get +//! [`PolicyBackend`]: super::PolicyBackend +//! [refresh]: super::refresh +//! [`REMOVE`]: Subscriber::remove +//! [`SET`]: Subscriber::set +//! 
[TTL]: super::ttl +use std::{ + any::Any, + collections::{btree_map::Entry, BTreeMap, BinaryHeap}, + fmt::Debug, + hash::Hash, + sync::{Arc, Weak}, +}; + +use iox_time::Time; +use metric::{U64Counter, U64Gauge}; +use observability_deps::tracing::trace; +use ouroboros::self_referencing; +use parking_lot::Mutex; +use tokio::{runtime::Handle, sync::Notify, task::JoinSet}; + +use crate::{ + addressable_heap::{AddressableHeap, AddressableHeapIter}, + backend::CacheBackend, + resource_consumption::{Resource, ResourceEstimator}, +}; + +use super::{CallbackHandle, ChangeRequest, Subscriber}; + +/// Wrapper around something that can be converted into `u64` +/// to enable emitting metrics. +#[derive(Debug)] +struct MeasuredT +where + S: Resource, +{ + v: S, + metric: U64Gauge, +} + +impl MeasuredT +where + S: Resource, +{ + fn new(v: S, metric: U64Gauge) -> Self { + metric.set(v.into()); + + Self { v, metric } + } + + fn inc(&mut self, delta: &S) { + self.v = self.v + *delta; + self.metric.inc((*delta).into()); + } + + fn dec(&mut self, delta: &S) { + self.v = self.v - *delta; + self.metric.dec((*delta).into()); + } +} + +/// Shared state between [`ResourcePool`] and [`clean_up_loop`]. +#[derive(Debug)] +struct SharedState +where + S: Resource, +{ + /// Resource limit. + limit: MeasuredT, + + /// Current resource usage. + current: Mutex>, + + /// Members (= backends) that use this pool. + members: Mutex>>>, + + /// Notification when [`current`](Self::current) as changed. + change_notify: Notify, +} + +impl SharedState +where + S: Resource, +{ + /// Get current members. + /// + /// This also performs a clean-up. + fn members(&self) -> BTreeMap<&'static str, Arc>> { + let mut members = self.members.lock(); + let mut out = BTreeMap::new(); + + members.retain(|id, member| match member.upgrade() { + Some(member) => { + out.insert(*id, member); + true + } + None => false, + }); + + out + } +} + +/// Resource pool. +/// +/// This can be used with [`LruPolicy`]. 
+#[derive(Debug)] +pub struct ResourcePool +where + S: Resource, +{ + /// Name of the pool. + name: &'static str, + + /// Shared state. + shared: Arc>, + + /// Metric registry associated with the pool. + /// + /// This is used to generate member-specific metrics as well. + metric_registry: Arc, + + /// Background task. + _background_task: JoinSet<()>, + + /// Notification when the background worker is idle, so tests know that the state has converged and that they can + /// continue working. + #[allow(dead_code)] + notify_idle_test_side: + tokio::sync::mpsc::UnboundedSender>, +} + +impl ResourcePool +where + S: Resource, +{ + /// Creates new empty resource pool with given limit. + pub fn new( + name: &'static str, + limit: S, + metric_registry: Arc, + runtime_handle: &Handle, + ) -> Self { + let metric_limit = metric_registry + .register_metric::("cache_lru_pool_limit", "Limit of the LRU resource pool") + .recorder(&[("unit", S::unit()), ("pool", name)]); + let limit = MeasuredT::new(limit, metric_limit); + + let metric_current = metric_registry + .register_metric::( + "cache_lru_pool_usage", + "Current consumption of the LRU resource pool", + ) + .recorder(&[("unit", S::unit()), ("pool", name)]); + let current = Mutex::new(MeasuredT::new(S::zero(), metric_current)); + + let shared = Arc::new(SharedState { + limit, + current, + members: Default::default(), + change_notify: Default::default(), + }); + + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + + let mut background_task = JoinSet::new(); + background_task.spawn_on(clean_up_loop(Arc::clone(&shared), rx), runtime_handle); + + Self { + name, + shared, + metric_registry, + _background_task: background_task, + notify_idle_test_side: tx, + } + } + + /// Get pool limit. + pub fn limit(&self) -> S { + self.shared.limit.v + } + + /// Get current pool usage. + pub fn current(&self) -> S { + self.shared.current.lock().v + } + + /// Register new pool member. 
+ /// + /// # Panic + /// Panics when a member with the specific ID is already registered. + fn register_member(&self, id: &'static str, member: Weak>) { + let mut members = self.shared.members.lock(); + + match members.entry(id) { + Entry::Vacant(v) => { + v.insert(member); + } + Entry::Occupied(mut o) => { + if o.get().strong_count() > 0 { + panic!("Member '{}' already registered", o.key()); + } else { + *o.get_mut() = member; + } + } + } + } + + /// Add used resource from pool. + fn add(&self, s: S) { + let mut current = self.shared.current.lock(); + current.inc(&s); + if current.v > self.shared.limit.v { + self.shared.change_notify.notify_one(); + } + } + + /// Remove used resource from pool. + fn remove(&self, s: S) { + self.shared.current.lock().dec(&s); + } + + /// Wait for the pool to converge to a steady state. + /// + /// This usually means that the background worker that runs the eviction loop is idle. + /// + /// # Panic + /// Panics if the background worker is not idle within 5s or if the worker died. + pub async fn wait_converged(&self) { + let (tx, rx) = futures::channel::oneshot::channel(); + self.notify_idle_test_side + .send(tx) + .expect("background worker alive"); + tokio::time::timeout(std::time::Duration::from_secs(5), rx) + .await + .unwrap() + .unwrap(); + } +} + +/// Cache policy that wraps another backend and limits its resource usage. +#[derive(Debug)] +pub struct LruPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, + S: Resource, +{ + /// Link to central resource pool. + pool: Arc>, + + /// Pool member + member: Arc>, + + /// Resource estimator that is used for new (via [`SET`](Subscriber::set)) entries. + resource_estimator: Arc>, + + /// Count number of elements within this specific pool member. + metric_count: U64Gauge, + + /// Count resource usage of this specific pool member. 
+ metric_usage: U64Gauge, +} + +impl LruPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, + S: Resource, +{ + /// Create new backend w/o any known keys. + /// + /// The inner backend MUST NOT contain any data at this point, otherwise we will not track any resource consumption + /// for these entries. + /// + /// # Panic + /// - Panics if the given ID is already used within the given pool. + /// - If the inner backend is not empty. + pub fn new( + pool: Arc>, + id: &'static str, + resource_estimator: Arc>, + ) -> impl FnOnce(CallbackHandle) -> Self { + let metric_count = pool + .metric_registry + .register_metric::( + "cache_lru_member_count", + "Number of entries for a given LRU cache pool member", + ) + .recorder(&[("pool", pool.name), ("member", id)]); + let metric_usage = pool + .metric_registry + .register_metric::( + "cache_lru_member_usage", + "Resource usage of a given LRU cache pool member", + ) + .recorder(&[("pool", pool.name), ("member", id), ("unit", S::unit())]); + let metric_evicted = pool + .metric_registry + .register_metric::( + "cache_lru_member_evicted", + "Number of entries that were evicted from a given LRU cache pool member", + ) + .recorder(&[("pool", pool.name), ("member", id)]); + + move |mut callback_handle| { + callback_handle.execute_requests(vec![ChangeRequest::ensure_empty()]); + + let member = Arc::new(PoolMemberImpl { + id, + last_used: Arc::new(Mutex::new(AddressableHeap::new())), + metric_evicted, + callback_handle: Mutex::new(callback_handle), + }); + + pool.register_member(id, Arc::downgrade(&member) as _); + + Self { + pool, + member, + resource_estimator, + metric_count, + metric_usage, + } + } + } +} + +impl Drop for LruPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, + S: Resource, +{ + fn drop(&mut self) { + let size_total = { + let mut guard = self.member.last_used.lock(); + let mut accu = S::zero(); + while let 
Some((_k, s, _t)) = guard.pop() { + accu = accu + s; + } + accu + }; + self.pool.remove(size_total); + } +} + +impl Subscriber for LruPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, + S: Resource, +{ + type K = K; + type V = V; + + fn get(&mut self, k: &Self::K, now: Time) -> Vec> { + trace!(?k, now = now.timestamp_nanos(), "LRU get",); + let mut last_used = self.member.last_used.lock(); + + // update "last used" + last_used.update_order(k, now); + + vec![] + } + + fn set( + &mut self, + k: &Self::K, + v: &Self::V, + now: Time, + ) -> Vec> { + trace!(?k, now = now.timestamp_nanos(), "LRU set",); + + // determine all attributes before getting any locks + let consumption = self.resource_estimator.consumption(k, v); + + // "last used" time for new entry + // Note: this might be updated if the entry already exists + let mut last_used_t = now; + + // check for oversized entries + if consumption > self.pool.shared.limit.v { + return vec![ChangeRequest::remove(k.clone())]; + } + + { + let mut last_used = self.member.last_used.lock(); + + // maybe clean from pool + if let Some((consumption, last_used_t_previously)) = last_used.remove(k) { + self.pool.remove(consumption); + self.metric_count.dec(1); + self.metric_usage.dec(consumption.into()); + last_used_t = last_used_t_previously; + } + + // add new entry to inner backend BEFORE adding it to the pool, because the we can overcommit for a short + // time and we want to give the pool a chance to also evict the new resource + last_used.insert(k.clone(), consumption, last_used_t); + self.metric_count.inc(1); + self.metric_usage.inc(consumption.into()); + } + + // pool-wide operation + // Since this may wake-up the background worker and cause evictions, drop the `last_used` lock before doing this (see + // block above) to avoid lock contention. 
+ self.pool.add(consumption); + + vec![] + } + + fn remove(&mut self, k: &Self::K, now: Time) -> Vec> { + trace!(?k, now = now.timestamp_nanos(), "LRU remove",); + let mut last_used = self.member.last_used.lock(); + + if let Some((consumption, _last_used)) = last_used.remove(k) { + self.pool.remove(consumption); + self.metric_count.dec(1); + self.metric_usage.dec(consumption.into()); + } + + vec![] + } +} + +/// Iterator for enumerating removal candidates of a [`PoolMember`]. +/// +/// This is type-erased to make [`PoolMember`] object-safe. +type PoolMemberCouldRemove = Box)>>; + +/// A member of a [`ResourcePool`]/[`SharedState`]. +/// +/// The only implementation of this is [`PoolMemberImpl`]. This indirection is required to erase `K` and `V` from specific +/// backend so we can stick it into the generic pool. +trait PoolMember: Debug + Send + Sync + 'static { + /// Resource type. + type S; + + /// Check if this member has anything that could be removed. + /// + /// If so, return: + /// - "last used" timestamp + /// - resource consumption of that entry + /// - type-erased key + /// + /// Elements are returned in order of the "last used" timestamp, in increasing order. + fn could_remove(&self) -> PoolMemberCouldRemove; + + /// Remove given set of keys. + /// + /// The keys MUST be a result of [`could_remove`](Self::could_remove), otherwise the downcasting may not work and panic. + fn remove_keys(&self, keys: Vec>); +} + +/// The only implementation of [`PoolMember`]. +/// +/// In contrast to the trait, this still contains `K` and `V`. +#[derive(Debug)] +pub struct PoolMemberImpl +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, + S: Resource, +{ + /// Pool member ID. + id: &'static str, + + /// Count number of evicted items. + metric_evicted: U64Counter, + + /// Tracks usage of the last used elements. + /// + /// See documentation of [`callback_handle`](Self::callback_handle) for a reasoning about locking. 
+ last_used: Arc>>, + + /// Handle to call back into the [`PolicyBackend`] to evict data. + /// + /// # Locking + /// This MUST NOT share a lock with [`last_used`](Self::last_used) because otherwise we would deadlock during + /// eviction: + /// + /// 1. [`remove_keys`](PoolMember::remove_keys) + /// 2. lock both [`callback_handle`](Self::callback_handle) and [`last_used`](Self::last_used) + /// 3. [`CallbackHandle::execute_requests`] + /// 4. [`Subscriber::remove`] + /// 5. need to lock [`last_used`](Self::last_used) again + /// + /// + /// [`PolicyBackend`]: super::PolicyBackend + callback_handle: Mutex>, +} + +impl PoolMember for PoolMemberImpl +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, + S: Resource, +{ + type S = S; + + fn could_remove(&self) -> Box)>> { + it::build_it(self.last_used.lock_arc()) + } + + fn remove_keys(&self, keys: Vec>) { + let keys = keys + .into_iter() + .map(|k| *k.downcast::().expect("wrong type")) + .collect::>(); + + trace!( + id = self.id, + ?keys, + "evicting cache entries due to LRU pressure", + ); + self.metric_evicted.inc(keys.len() as u64); + + let combined = ChangeRequest::from_fn(move |backend| { + for k in keys { + backend.remove(&k); + } + }); + + self.callback_handle.lock().execute_requests(vec![combined]); + } +} + +/// Helper module that wraps the iterator handling for [`PoolMember`]/[`PoolMemberImpl`]. +/// +/// This is required because [`ouroboros`] generates a bunch of code that we do not want to leak all over the place. +mod it { + // ignore some lints for the ouroboros codegen + #![allow(clippy::future_not_send)] + + use super::*; + + /// The lock that we need to generate a candidate iterator. 
+ pub type Lock = + parking_lot::lock_api::ArcMutexGuard>; + + #[self_referencing] + struct PoolMemberIter + where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + S: Resource, + { + lock: Lock, + + #[borrows(lock)] + #[covariant] + it: AddressableHeapIter<'this, K, S, Time>, + } + + impl Iterator for PoolMemberIter + where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + S: Resource, + { + type Item = (Time, S, Box); + + fn next(&mut self) -> Option { + self.with_it_mut(|it| { + it.next() + .map(|(k, s, t)| (*t, *s, Box::new(k.clone()) as _)) + }) + } + + fn size_hint(&self) -> (usize, Option) { + self.borrow_it().size_hint() + } + } + + /// Build iterator. + pub fn build_it(lock: Lock) -> PoolMemberCouldRemove + where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + S: Resource, + { + Box::new( + PoolMemberIterBuilder { + lock, + it_builder: |lock| lock.iter(), + } + .build(), + ) + } +} + +/// Background worker that eventually cleans up data if the pool reaches capacity. +/// +/// This method NEVER returns. +async fn clean_up_loop( + shared: Arc>, + mut notify_idle_worker_side: tokio::sync::mpsc::UnboundedReceiver< + futures::channel::oneshot::Sender<()>, + >, +) where + S: Resource, +{ + 'outer: loop { + // yield to tokio so that the runtime has a chance to abort this function during shutdown + tokio::task::yield_now().await; + + // get current value but drop the lock immediately + // Especially we must NOT hold the lock when we later execute the change requests, otherwise there will be two + // lock direction: + // - someone adding new resource: member -> pool + // - clean up loop: pool -> memeber + let mut current = { + let guard = shared.current.lock(); + guard.v + }; + + if current <= shared.limit.v { + // nothing to do, sleep and then continue w/ next round + loop { + tokio::select! 
{ + // biased sleep so we can notify test hooks if we're idle + biased; + + _ = shared.change_notify.notified() => {continue 'outer;}, + + idle_notify = notify_idle_worker_side.recv() => { + if let Some(n) = idle_notify { + n.send(()).ok(); + } + }, + } + } + } + + // receive members + // Do NOT hold the member lock during the deletion later because this can lead to deadlocks during shutdown. + let members = shared.members(); + if members.is_empty() { + // early retry, there's nothing we can do + continue; + } + + // select victims + let mut victims: BTreeMap<&'static str, Vec>> = Default::default(); + { + trace!( + current = current.into(), + limit = shared.limit.v.into(), + "select eviction victims" + ); + + // limit scope of member iterators, because they contain locks and we MUST drop them before proceeding to + // the actual deletion + let mut heap: BinaryHeap> = members + .iter() + .map(|(id, member)| EvictionCandidateIter::new(id, member.could_remove())) + .collect(); + + while current > shared.limit.v { + let candidate = heap.pop().expect("checked that we have at least 1 member"); + let (candidate, victim) = candidate.next(); + + match victim { + Some((t, s, k)) => { + trace!( + id = candidate.id, + s = s.into(), + t_ns = t.timestamp_nanos(), + "found victim" + ); + current = current - s; + victims.entry(candidate.id).or_default().push(k); + } + None => { + // The custom `Ord` implementation ensures that we prefer iterators with data over iterators + // without any candidates. So if the "best" iterators has NO candidates, this means that ALL + // iterators are empty. + // + // Or in other words: some data was deleted between retrieving the "current" value and locking + // the iterators. This is fine, just stop looping and remove the victims that we have selected + // so far. 
+ trace!("no more data"); + break; + } + } + + heap.push(candidate); + } + + trace!("done selecting eviction victims"); + } + + for (id, keys) in victims { + let member = members.get(id).expect("did get this ID from this map"); + member.remove_keys(keys); + } + } +} + +/// Current element presented by the [`EvictionCandidateIter`]. +type EvictionCandidate = Option<(Time, S, Box)>; + +/// Wraps a [`PoolMember`] so we can compare it in a "tournament" to find out what data to evict. +struct EvictionCandidateIter +where + S: Resource, +{ + id: &'static str, + it: PoolMemberCouldRemove, + current: EvictionCandidate, +} + +impl EvictionCandidateIter +where + S: Resource, +{ + fn new(id: &'static str, mut it: PoolMemberCouldRemove) -> Self { + let current = it.next(); + Self { id, it, current } + } + + /// Get next eviction candidate. + /// + /// This advances the internal state so that this iterator compares correctly afterwards. + fn next(mut self) -> (Self, EvictionCandidate) { + let mut tmp = self.it.next(); + std::mem::swap(&mut tmp, &mut self.current); + (self, tmp) + } +} + +impl PartialEq for EvictionCandidateIter +where + S: Resource, +{ + fn eq(&self, other: &Self) -> bool { + match (self.current.as_ref(), other.current.as_ref()) { + (None, None) | (Some(_), None) | (None, Some(_)) => false, + (Some((t1, s1, _k1)), Some((t2, s2, _k2))) => (t1, s1) == (t2, s2), + } + } +} + +impl Eq for EvictionCandidateIter where S: Resource {} + +impl PartialOrd for EvictionCandidateIter +where + S: Resource, +{ + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for EvictionCandidateIter +where + S: Resource, +{ + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Note: reverse order because iterators are kept in a MAX heap + match (self.current.as_ref(), other.current.as_ref()) { + (None, None) => { + // break tie + self.id.cmp(other.id).reverse() + } + + // prefer iterators with candidates over empty iterators + (Some(_), None) 
=> std::cmp::Ordering::Greater, + (None, Some(_)) => std::cmp::Ordering::Less, + + (Some((t1, _s1, _k1)), Some((t2, _s2, _k2))) => { + // compare by time, break tie using member ID + (t1, self.id).cmp(&(t2, other.id)).reverse() + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{collections::HashMap, time::Duration}; + + use iox_time::{MockProvider, SystemProvider}; + use metric::{Observation, RawReporter}; + use test_helpers::maybe_start_logging; + + use crate::{ + backend::{policy::PolicyBackend, CacheBackend}, + resource_consumption::test_util::TestSize, + }; + + use super::*; + + #[tokio::test] + #[should_panic(expected = "inner backend is not empty")] + async fn test_panic_inner_not_empty() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend = PolicyBackend::hashmap_backed(time_provider); + let policy_constructor = LruPolicy::new( + Arc::clone(&pool), + "id", + Arc::clone(&resource_estimator) as _, + ); + backend.add_policy(|mut callback_handle| { + callback_handle.execute_requests(vec![ChangeRequest::set(String::from("foo"), 1usize)]); + policy_constructor(callback_handle) + }) + } + + #[tokio::test] + #[should_panic(expected = "Member 'id' already registered")] + async fn test_panic_id_collision() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend1 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend1.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id", + Arc::clone(&resource_estimator) as _, + )); + + let mut backend2 = 
PolicyBackend::hashmap_backed(time_provider); + backend2.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id", + Arc::clone(&resource_estimator) as _, + )); + } + + #[tokio::test] + async fn test_reregister_member() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend1 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend1.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id", + Arc::clone(&resource_estimator) as _, + )); + backend1.set(String::from("a"), 1usize); + assert_eq!(pool.current(), TestSize(1)); + + // drop the backend so re-registering the same ID ("id") MUST NOT panic + drop(backend1); + assert_eq!(pool.current(), TestSize(0)); + + let mut backend2 = PolicyBackend::hashmap_backed(time_provider); + backend2.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id", + Arc::clone(&resource_estimator) as _, + )); + backend2.set(String::from("a"), 2usize); + assert_eq!(pool.current(), TestSize(2)); + } + + #[tokio::test] + async fn test_empty() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + assert_eq!(pool.current().0, 0); + + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + assert_eq!(pool.current().0, 0); + } + + #[tokio::test] + async fn test_double_set() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(2), + 
Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + backend.set(String::from("a"), 1usize); + time_provider.inc(Duration::from_millis(1)); + + backend.set(String::from("b"), 1usize); + time_provider.inc(Duration::from_millis(1)); + + // does NOT count as "used" + backend.set(String::from("a"), 1usize); + time_provider.inc(Duration::from_millis(1)); + + backend.set(String::from("c"), 1usize); + pool.wait_converged().await; + + assert_eq!(backend.get(&String::from("a")), None); + } + + #[tokio::test] + async fn test_override() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + backend.set(String::from("a"), 5usize); + assert_eq!(pool.current().0, 5); + + backend.set(String::from("b"), 3usize); + assert_eq!(pool.current().0, 8); + + backend.set(String::from("a"), 4usize); + assert_eq!(pool.current().0, 7); + } + + #[tokio::test] + async fn test_remove() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + 
Arc::clone(&resource_estimator) as _, + )); + + backend.set(String::from("a"), 5usize); + assert_eq!(pool.current().0, 5); + + backend.set(String::from("b"), 3usize); + assert_eq!(pool.current().0, 8); + + backend.remove(&String::from("a")); + assert_eq!(pool.current().0, 3); + + assert_eq!(backend.get(&String::from("a")), None); + assert_inner_backend(&mut backend, [(String::from("b"), 3)]); + + // removing it again should just work + backend.remove(&String::from("a")); + assert_eq!(pool.current().0, 3); + } + + #[tokio::test] + async fn test_eviction_order() { + maybe_start_logging(); + + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(21), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend1 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend1.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + let mut backend2 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend2.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id2", + Arc::clone(&resource_estimator) as _, + )); + + backend1.set(String::from("b"), 1usize); + backend2.set(String::from("a"), 2usize); + backend1.set(String::from("a"), 3usize); + backend1.set(String::from("c"), 4usize); + assert_eq!(pool.current().0, 10); + + time_provider.inc(Duration::from_millis(1)); + + backend1.set(String::from("d"), 5usize); + assert_eq!(pool.current().0, 15); + + time_provider.inc(Duration::from_millis(1)); + backend2.set(String::from("b"), 6usize); + assert_eq!(pool.current().0, 21); + + time_provider.inc(Duration::from_millis(1)); + + // now are exactly at capacity + pool.wait_converged().await; + assert_inner_backend( + &mut backend1, + [ + (String::from("a"), 3), + (String::from("b"), 1), + (String::from("c"), 4), + 
(String::from("d"), 5), + ], + ); + assert_inner_backend( + &mut backend2, + [(String::from("a"), 2), (String::from("b"), 6)], + ); + + // adding a single element will drop the smallest key from the first backend (by ID) + backend1.set(String::from("foo1"), 1usize); + pool.wait_converged().await; + assert_eq!(pool.current().0, 19); + assert_inner_backend( + &mut backend1, + [ + (String::from("b"), 1), + (String::from("c"), 4), + (String::from("d"), 5), + (String::from("foo1"), 1), + ], + ); + assert_inner_backend( + &mut backend2, + [(String::from("a"), 2), (String::from("b"), 6)], + ); + + // now we can fill up data up to the capacity again + backend1.set(String::from("foo2"), 2usize); + pool.wait_converged().await; + assert_eq!(pool.current().0, 21); + assert_inner_backend( + &mut backend1, + [ + (String::from("b"), 1), + (String::from("c"), 4), + (String::from("d"), 5), + (String::from("foo1"), 1), + (String::from("foo2"), 2), + ], + ); + assert_inner_backend( + &mut backend2, + [(String::from("a"), 2), (String::from("b"), 6)], + ); + + // can evict two keys at the same time + backend1.set(String::from("foo3"), 2usize); + pool.wait_converged().await; + assert_eq!(pool.current().0, 18); + assert_inner_backend( + &mut backend1, + [ + (String::from("d"), 5), + (String::from("foo1"), 1), + (String::from("foo2"), 2), + (String::from("foo3"), 2), + ], + ); + assert_inner_backend( + &mut backend2, + [(String::from("a"), 2), (String::from("b"), 6)], + ); + + // can evict from another backend + backend1.set(String::from("foo4"), 4usize); + pool.wait_converged().await; + assert_eq!(pool.current().0, 20); + assert_inner_backend( + &mut backend1, + [ + (String::from("d"), 5), + (String::from("foo1"), 1), + (String::from("foo2"), 2), + (String::from("foo3"), 2), + (String::from("foo4"), 4), + ], + ); + assert_inner_backend(&mut backend2, [(String::from("b"), 6)]); + + // can evict multiple timestamps + backend1.set(String::from("foo5"), 7usize); + 
pool.wait_converged().await; + assert_eq!(pool.current().0, 16); + assert_inner_backend( + &mut backend1, + [ + (String::from("foo1"), 1), + (String::from("foo2"), 2), + (String::from("foo3"), 2), + (String::from("foo4"), 4), + (String::from("foo5"), 7), + ], + ); + assert_inner_backend(&mut backend2, []); + } + + #[tokio::test] + async fn test_get_updates_last_used() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(6), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + backend.set(String::from("a"), 1usize); + backend.set(String::from("b"), 2usize); + + time_provider.inc(Duration::from_millis(1)); + + backend.set(String::from("c"), 3usize); + pool.wait_converged().await; + + time_provider.inc(Duration::from_millis(1)); + + assert_eq!(backend.get(&String::from("a")), Some(1usize)); + + assert_eq!(pool.current().0, 6); + assert_inner_backend( + &mut backend, + [ + (String::from("a"), 1), + (String::from("b"), 2), + (String::from("c"), 3), + ], + ); + + backend.set(String::from("foo"), 3usize); + pool.wait_converged().await; + assert_eq!(pool.current().0, 4); + assert_inner_backend( + &mut backend, + [(String::from("a"), 1), (String::from("foo"), 3)], + ); + } + + #[tokio::test] + async fn test_oversized_entries() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(LruPolicy::new( + 
Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + backend.set(String::from("a"), 1usize); + pool.wait_converged().await; + backend.set(String::from("b"), 11usize); + pool.wait_converged().await; + + // "a" did NOT get evicted. Instead we removed the oversized entry straight away. + assert_eq!(pool.current().0, 1); + assert_inner_backend(&mut backend, [(String::from("a"), 1)]); + } + + #[tokio::test] + async fn test_values_are_dropped() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(3), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + + #[derive(Debug)] + struct Provider {} + + impl ResourceEstimator for Provider { + type K = Arc; + type V = Arc; + type S = TestSize; + + fn consumption(&self, _k: &Self::K, v: &Self::V) -> Self::S { + TestSize(*v.as_ref()) + } + } + + let resource_estimator = Arc::new(Provider {}); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + let k1 = Arc::new(String::from("a")); + let v1 = Arc::new(2usize); + let k2 = Arc::new(String::from("b")); + let v2 = Arc::new(2usize); + let k1_weak = Arc::downgrade(&k1); + let v1_weak = Arc::downgrade(&v1); + + backend.set(k1, v1); + pool.wait_converged().await; + + time_provider.inc(Duration::from_millis(1)); + + backend.set(k2, v2); + pool.wait_converged().await; + + assert_eq!(k1_weak.strong_count(), 0); + assert_eq!(v1_weak.strong_count(), 0); + } + + #[tokio::test] + async fn test_backends_are_dropped() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(3), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + + let resource_estimator = Arc::new(TestResourceEstimator {}); + + #[derive(Debug)] + struct 
Backend { + #[allow(dead_code)] + marker: Arc<()>, + inner: HashMap, + } + + impl CacheBackend for Backend { + type K = String; + type V = usize; + + fn get(&mut self, k: &Self::K) -> Option { + self.inner.get(k).copied() + } + + fn set(&mut self, k: Self::K, v: Self::V) { + self.inner.set(k, v) + } + + fn remove(&mut self, k: &Self::K) { + self.inner.remove(k); + } + + fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + fn as_any(&self) -> &dyn Any { + self as &dyn Any + } + } + + let marker = Arc::new(()); + let marker_weak = Arc::downgrade(&marker); + + let mut backend = PolicyBackend::new( + Box::new(Backend { + marker, + inner: HashMap::new(), + }), + Arc::clone(&time_provider) as _, + ); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + backend.set(String::from("a"), 2usize); + + drop(backend); + assert_eq!(marker_weak.strong_count(), 0); + } + + #[tokio::test] + async fn test_metrics() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let metric_registry = Arc::new(metric::Registry::new()); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::clone(&metric_registry), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut reporter = RawReporter::default(); + metric_registry.report(&mut reporter); + assert_eq!( + reporter + .metric("cache_lru_pool_limit") + .unwrap() + .observation(&[("pool", "pool"), ("unit", "bytes")]) + .unwrap(), + &Observation::U64Gauge(10) + ); + assert_eq!( + reporter + .metric("cache_lru_pool_usage") + .unwrap() + .observation(&[("pool", "pool"), ("unit", "bytes")]) + .unwrap(), + &Observation::U64Gauge(0) + ); + + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id", + Arc::clone(&resource_estimator) as _, + )); + + let mut reporter = RawReporter::default(); + 
metric_registry.report(&mut reporter); + assert_eq!( + reporter + .metric("cache_lru_pool_limit") + .unwrap() + .observation(&[("pool", "pool"), ("unit", "bytes")]) + .unwrap(), + &Observation::U64Gauge(10) + ); + assert_eq!( + reporter + .metric("cache_lru_pool_usage") + .unwrap() + .observation(&[("pool", "pool"), ("unit", "bytes")]) + .unwrap(), + &Observation::U64Gauge(0) + ); + assert_eq!( + reporter + .metric("cache_lru_member_count") + .unwrap() + .observation(&[("pool", "pool"), ("member", "id")]) + .unwrap(), + &Observation::U64Gauge(0) + ); + assert_eq!( + reporter + .metric("cache_lru_member_usage") + .unwrap() + .observation(&[("pool", "pool"), ("unit", "bytes"), ("member", "id")]) + .unwrap(), + &Observation::U64Gauge(0) + ); + assert_eq!( + reporter + .metric("cache_lru_member_evicted") + .unwrap() + .observation(&[("pool", "pool"), ("member", "id")]) + .unwrap(), + &Observation::U64Counter(0) + ); + + backend.set(String::from("a"), 1usize); // usage = 1 + pool.wait_converged().await; + backend.set(String::from("b"), 2usize); // usage = 3 + pool.wait_converged().await; + backend.set(String::from("b"), 3usize); // usage = 4 + pool.wait_converged().await; + backend.set(String::from("c"), 4usize); // usage = 8 + pool.wait_converged().await; + backend.set(String::from("d"), 3usize); // usage = 10 (evicted "a") + pool.wait_converged().await; + backend.remove(&String::from("c")); // usage = 6 + pool.wait_converged().await; + + let mut reporter = RawReporter::default(); + metric_registry.report(&mut reporter); + assert_eq!( + reporter + .metric("cache_lru_pool_limit") + .unwrap() + .observation(&[("pool", "pool"), ("unit", "bytes")]) + .unwrap(), + &Observation::U64Gauge(10) + ); + assert_eq!( + reporter + .metric("cache_lru_pool_usage") + .unwrap() + .observation(&[("pool", "pool"), ("unit", "bytes")]) + .unwrap(), + &Observation::U64Gauge(6) + ); + assert_eq!( + reporter + .metric("cache_lru_member_count") + .unwrap() + .observation(&[("pool", "pool"), 
("member", "id")]) + .unwrap(), + &Observation::U64Gauge(2), // b and d + ); + assert_eq!( + reporter + .metric("cache_lru_member_usage") + .unwrap() + .observation(&[("pool", "pool"), ("unit", "bytes"), ("member", "id")]) + .unwrap(), + &Observation::U64Gauge(6) + ); + assert_eq!( + reporter + .metric("cache_lru_member_evicted") + .unwrap() + .observation(&[("pool", "pool"), ("member", "id")]) + .unwrap(), + &Observation::U64Counter(1) + ); + } + + /// A note regarding the test flavor: + /// + /// The main generic test function is not async, so the background clean-up would never fire because we don't + /// yield to tokio. The test will pass in both cases (w/ a single worker and w/ multiple), however if the + /// background worker is a actually doing anything it might be a more realistic test case. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_generic_backend() { + use crate::backend::test_util::test_generic; + + #[derive(Debug)] + struct ZeroSizeProvider {} + + impl ResourceEstimator for ZeroSizeProvider { + type K = u8; + type V = String; + type S = TestSize; + + fn consumption(&self, _k: &Self::K, _v: &Self::V) -> Self::S { + TestSize(0) + } + } + + test_generic(|| { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(ZeroSizeProvider {}); + + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id", + Arc::clone(&resource_estimator) as _, + )); + backend + }); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_deadlock() { + // Regression test for . 
+ test_deadlock_inner(Duration::from_secs(1)).await; + + // Regression test for + for _ in 0..100 { + test_deadlock_inner(Duration::from_millis(1)).await; + } + } + + async fn test_deadlock_inner(test_duration: Duration) { + #[derive(Debug)] + struct OneSizeProvider {} + + impl ResourceEstimator for OneSizeProvider { + type K = u128; + type V = (); + type S = TestSize; + + fn consumption(&self, _k: &Self::K, _v: &Self::V) -> Self::S { + TestSize(1) + } + } + + let time_provider = Arc::new(SystemProvider::new()) as _; + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(100), + Arc::new(metric::Registry::new()), + &Handle::current(), + )); + let resource_estimator = Arc::new(OneSizeProvider {}); + + let mut backend1 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider)); + backend1.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + let mut backend2 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider)); + backend2.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id2", + Arc::clone(&resource_estimator) as _, + )); + + let worker1 = tokio::spawn(async move { + let mut counter = 0u128; + loop { + backend1.set(counter, ()); + counter += 2; + tokio::task::yield_now().await; + } + }); + let worker2 = tokio::spawn(async move { + let mut counter = 1u128; + loop { + backend2.set(counter, ()); + counter += 2; + tokio::task::yield_now().await; + } + }); + + tokio::time::sleep(test_duration).await; + + worker1.abort(); + worker2.abort(); + } + + #[tokio::test] + async fn test_efficient_eviction() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let metric_registry = Arc::new(metric::Registry::new()); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(10), + Arc::clone(&metric_registry), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) 
as _); + backend.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id", + Arc::clone(&resource_estimator) as _, + )); + + // fill up pool + for i in 0..10 { + backend.set(i.to_string(), 1usize); + } + assert_eq!(pool.current(), TestSize(10)); + + // evict all members using a single large one + time_provider.inc(Duration::from_millis(1)); + backend.set(String::from("big"), 10usize); + pool.wait_converged().await; + assert_eq!(pool.current(), TestSize(10)); + + let mut reporter = RawReporter::default(); + metric_registry.report(&mut reporter); + assert_eq!( + reporter + .metric("cache_lru_member_evicted") + .unwrap() + .observation(&[("pool", "pool"), ("member", "id")]) + .unwrap(), + // it is important that all 10 items are evicted with a single eviction + &Observation::U64Counter(10) + ); + } + + #[tokio::test] + async fn test_eviction_half_half() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let metric_registry = Arc::new(metric::Registry::new()); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(20), + Arc::clone(&metric_registry), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend1 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend1.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + let mut backend2 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend2.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id2", + Arc::clone(&resource_estimator) as _, + )); + + // fill up pool + for i in 0..10 { + backend1.set(i.to_string(), 1usize); + backend2.set(i.to_string(), 1usize); + time_provider.inc(Duration::from_millis(1)); + } + assert_eq!(pool.current(), TestSize(20)); + + // evict members using a single large one + time_provider.inc(Duration::from_millis(1)); + backend1.set(String::from("big"), 10usize); + pool.wait_converged().await; + 
assert_eq!(pool.current(), TestSize(20)); + + // every member lost 5 entries + // Note: backend1 has 5+1 items because it own the "big" key + assert_inner_len(&mut backend1, 6); + assert_inner_len(&mut backend2, 5); + } + + #[tokio::test] + async fn test_eviction_one_member_all_other_member_some() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0))); + let metric_registry = Arc::new(metric::Registry::new()); + let pool = Arc::new(ResourcePool::new( + "pool", + TestSize(3), + Arc::clone(&metric_registry), + &Handle::current(), + )); + let resource_estimator = Arc::new(TestResourceEstimator {}); + + let mut backend1 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend1.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id1", + Arc::clone(&resource_estimator) as _, + )); + + let mut backend2 = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend2.add_policy(LruPolicy::new( + Arc::clone(&pool), + "id2", + Arc::clone(&resource_estimator) as _, + )); + + // fill up pool + backend1.set(String::from("a"), 1usize); + time_provider.inc(Duration::from_millis(1)); + backend2.set(String::from("a"), 1usize); + time_provider.inc(Duration::from_millis(1)); + backend2.set(String::from("b"), 1usize); + assert_eq!(pool.current(), TestSize(3)); + + // evict members using a single large one + time_provider.inc(Duration::from_millis(1)); + backend2.set(String::from("big"), 2usize); + pool.wait_converged().await; + assert_eq!(pool.current(), TestSize(3)); + + assert_inner_backend(&mut backend1, []); + assert_inner_backend( + &mut backend2, + [(String::from("b"), 1usize), (String::from("big"), 2usize)], + ); + } + + #[derive(Debug)] + struct TestResourceEstimator {} + + impl ResourceEstimator for TestResourceEstimator { + type K = String; + type V = usize; + type S = TestSize; + + fn consumption(&self, _k: &Self::K, v: &Self::V) -> Self::S { + TestSize(*v) + } + } + + #[track_caller] + fn assert_inner_backend( + 
backend: &mut PolicyBackend, + data: [(String, usize); N], + ) { + let inner_backend = backend.inner_ref(); + let inner_backend = inner_backend + .as_any() + .downcast_ref::>() + .unwrap(); + let expected = HashMap::from(data); + assert_eq!(inner_backend, &expected); + } + + #[track_caller] + fn assert_inner_len(backend: &mut PolicyBackend, len: usize) { + let inner_backend = backend.inner_ref(); + let inner_backend = inner_backend + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(inner_backend.len(), len); + } +} diff --git a/cache_system/src/backend/policy/mod.rs b/cache_system/src/backend/policy/mod.rs new file mode 100644 index 0000000..c503c2a --- /dev/null +++ b/cache_system/src/backend/policy/mod.rs @@ -0,0 +1,1974 @@ +//! Policy framework for [backends](crate::backend::CacheBackend). + +use std::{ + cell::RefCell, + collections::{HashMap, VecDeque}, + fmt::Debug, + hash::Hash, + marker::PhantomData, + ops::Deref, + sync::{Arc, Weak}, +}; + +use iox_time::{Time, TimeProvider}; +use parking_lot::{lock_api::ArcReentrantMutexGuard, RawMutex, RawThreadId, ReentrantMutex}; + +use super::CacheBackend; + +pub mod lru; +pub mod refresh; +pub mod remove_if; +pub mod ttl; + +#[cfg(test)] +mod integration_tests; + +/// Convenience macro to easily follow the borrow/lock chain of [`StrongSharedInner`]. +/// +/// This cannot just be a method because we cannot return references to local variables. +macro_rules! lock_inner { + ($guard:ident = $inner:expr) => { + let $guard = $inner.lock(); + let $guard = $guard.try_borrow_mut().expect("illegal recursive access"); + }; + (mut $guard:ident = $inner:expr) => { + let $guard = $inner.lock(); + let mut $guard = $guard.try_borrow_mut().expect("illegal recursive access"); + }; +} + +/// Backend that is controlled by different policies. +/// +/// # Policies & Recursion +/// +/// Policies have two tasks: +/// +/// - initiate changes (e.g. 
based on timers) +/// - react to changes +/// +/// Getting data from a [`PolicyBackend`] and feeding data back into it in a somewhat synchronous +/// manner sounds really close to recursion. Uncontrolled recursion however is bad for the +/// following reasons: +/// +/// 1. **Stack space:** We may easily run out of stack space. +/// 2. **Ownership:** Looping back into the same data structure can easily lead to deadlocks (data +/// corruption is luckily prevented by Rust's ownership model). +/// +/// However sometimes we need to have interactions of policies in a "recursive" manner. E.g.: +/// +/// 1. A refresh policies updates a value based on a timer. The value gets bigger. +/// 2. Some resource-pool policy decides that this is now too much data and wants to evict data. +/// 3. The refresh policy gets informed about the values that are removed so it can stop refreshing +/// them. +/// +/// The solution that [`PolicyBackend`] uses is the following: +/// +/// All interaction of the policy with a [`PolicyBackend`] happens through a proxy object called +/// [`ChangeRequest`]. The [`ChangeRequest`] encapsulates a single atomic "transaction" on the +/// underlying store. This can be a simple operation as [`REMOVE`](CacheBackend::remove) but also +/// compound operations like "get+remove" (e.g. to check if a value needs to be pruned from the +/// cache). The policy has two ways of issuing [`ChangeRequest`]s: +/// +/// 1. **Initial / self-driven:** Upon creation the policy receives a [`CallbackHandle`] that it +/// can use initiate requests. This handle must only be used to create requests "out of thin +/// air" (e.g. based on a timer). It MUST NOT be used to react to changes (see next point) to +/// avoid deadlocks. +/// 2. **Reactions:** Each policy implements a [`Subscriber`] that receives notifications for each +/// changes. These notification return [`ChangeRequest`]s that the policy wishes to be +/// performed. This construct is designed to avoid recursion. 
+/// +/// Also note that a policy that uses the subscriber interface MUST NOT hold locks on their +/// internal data structure while performing _initial requests_ to avoid deadlocks (since the +/// subscriber will be informed about the changes). +/// +/// We cannot guarantee that policies fulfill this interface, but [`PolicyBackend`] performs some +/// sanity checks (e.g. it will catch if the same thread that started an initial requests recurses +/// into another initial request). +/// +/// # Change Propagation +/// +/// Each [`ChangeRequest`] is processed atomically, so "get + set" / "compare + exchange" patterns +/// work as expected. +/// +/// Changes will be propagated "breadth first". This means that the initial changes will form a +/// task list. For every task in this list (front to back), we will execute the [`ChangeRequest`]. +/// Every change that is performed within this request (usually only one) we propagate the change +/// as follows: +/// +/// 1. underlying backend +/// 2. policies (in the order they where added) +/// +/// From step 2 we collect new change requests that will be added to the back of the task list. +/// +/// The original requests will return to the caller once all tasks are completed. +/// +/// When a [`ChangeRequest`] performs multiple operations -- e.g. [`GET`](CacheBackend::get) and +/// [`SET`](CacheBackend::set) -- we first inform all subscribers about the first operation (in +/// this case: [`GET`](CacheBackend::get)) and collect the resulting [`ChangeRequest`]s. Then we +/// process the second operation (in this case: [`SET`](CacheBackend::set)). +/// +/// # `GET` +/// +/// The return value for [`CacheBackend::get`] is fetched from the inner backend AFTER all changes +/// are applied. +/// +/// Note [`ChangeRequest::get`] has no way of returning a result to the [`Subscriber`] that created +/// it. The "changes" solely act as some kind of "keep alive" / "this was used" signal. 
+/// +/// # Example +/// +/// **The policies in these examples are deliberately silly but simple!** +/// +/// Let's start with a purely reactive policy that will round up all integer values to the next +/// even number: +/// +/// ``` +/// use std::{ +/// collections::HashMap, +/// sync::Arc, +/// }; +/// use cache_system::backend::{ +/// CacheBackend, +/// policy::{ +/// ChangeRequest, +/// PolicyBackend, +/// Subscriber, +/// }, +/// }; +/// use iox_time::{ +/// SystemProvider, +/// Time, +/// }; +/// +/// #[derive(Debug)] +/// struct EvenNumberPolicy; +/// +/// type CR = ChangeRequest<'static, &'static str, u64>; +/// +/// impl Subscriber for EvenNumberPolicy { +/// type K = &'static str; +/// type V = u64; +/// +/// fn set(&mut self, k: &&'static str, v: &u64, _now: Time) -> Vec { +/// // When new key `k` is set to value `v` if `v` is odd, +/// // request a change to set `k` to `v+1` +/// if v % 2 == 1 { +/// vec![CR::set(k, v + 1)] +/// } else { +/// vec![] +/// } +/// } +/// } +/// +/// let mut backend = PolicyBackend::new( +/// Box::new(HashMap::new()), +/// Arc::new(SystemProvider::new()), +/// ); +/// backend.add_policy(|_callback_backend| EvenNumberPolicy); +/// +/// backend.set("foo", 8); +/// backend.set("bar", 9); +/// +/// assert_eq!(backend.get(&"foo"), Some(8)); +/// assert_eq!(backend.get(&"bar"), Some(10)); +/// ``` +/// +/// And here is a more active backend that regularly writes the current system time to a key: +/// +/// ``` +/// use std::{ +/// collections::HashMap, +/// sync::{ +/// Arc, +/// atomic::{AtomicBool, Ordering}, +/// }, +/// thread::{JoinHandle, sleep, spawn}, +/// time::{Duration, Instant}, +/// }; +/// use cache_system::backend::{ +/// CacheBackend, +/// policy::{ +/// ChangeRequest, +/// PolicyBackend, +/// Subscriber, +/// }, +/// }; +/// use iox_time::SystemProvider; +/// +/// #[derive(Debug)] +/// struct NowPolicy { +/// cancel: Arc, +/// join_handle: Option>, +/// }; +/// +/// impl Drop for NowPolicy { +/// fn drop(&mut 
self) { +/// self.cancel.store(true, Ordering::SeqCst); +/// self.join_handle +/// .take() +/// .expect("worker thread present") +/// .join() +/// .expect("worker thread finished"); +/// } +/// } +/// +/// type CR = ChangeRequest<'static, &'static str, Instant>; +/// +/// impl Subscriber for NowPolicy { +/// type K = &'static str; +/// type V = Instant; +/// } +/// +/// let mut backend = PolicyBackend::new( +/// Box::new(HashMap::new()), +/// Arc::new(SystemProvider::new()), +/// ); +/// backend.add_policy(|mut callback_handle| { +/// let cancel = Arc::new(AtomicBool::new(false)); +/// let cancel_captured = Arc::clone(&cancel); +/// let join_handle = spawn(move || { +/// loop { +/// if cancel_captured.load(Ordering::SeqCst) { +/// break; +/// } +/// callback_handle.execute_requests(vec![ +/// CR::set("now", Instant::now()), +/// ]); +/// sleep(Duration::from_millis(1)); +/// } +/// }); +/// NowPolicy{cancel, join_handle: Some(join_handle)} +/// }); +/// +/// +/// // eventually we should see a key +/// let t_start = Instant::now(); +/// loop { +/// if let Some(t) = backend.get(&"now") { +/// // value should be fresh +/// assert!(t.elapsed() < Duration::from_millis(100)); +/// break; +/// } +/// +/// assert!(t_start.elapsed() < Duration::from_secs(1)); +/// sleep(Duration::from_millis(10)); +/// } +/// ``` +#[derive(Debug)] +pub struct PolicyBackend +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + inner: StrongSharedInner, +} + +impl PolicyBackend +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// Create new backend w/o any policies. + /// + /// # Panic + /// + /// Panics if `inner` is not empty. 
+ pub fn new( + inner: Box + Send>, + time_provider: Arc, + ) -> Self { + assert!(inner.is_empty(), "inner backend is not empty"); + + Self { + inner: Arc::new(ReentrantMutex::new(RefCell::new(PolicyBackendInner { + inner, + subscribers: Vec::new(), + time_provider, + }))), + } + } + + /// Create a new backend with a HashMap as the [`CacheBackend`]. + pub fn hashmap_backed(time_provider: Arc) -> Self { + // See . This clippy lint suggests + // replacing `Box::new(HashMap::new())` with `Box::default()`, which in most cases would be + // shorter, but because this type is actually a `Box`, the replacement would + // need to be `Box::>::default()`, which doesn't seem like an improvement. + #[allow(clippy::box_default)] + Self::new(Box::new(HashMap::new()), Arc::clone(&time_provider)) + } + + /// Adds new policy. + /// + /// See documentation of [`PolicyBackend`] for more information. + /// + /// This is called with a function that receives the "callback backend" to this backend and + /// should return a [`Subscriber`]. This loopy construct was chosen to discourage the leakage + /// of the "callback backend" to any other object. + pub fn add_policy(&mut self, policy_constructor: C) + where + C: FnOnce(CallbackHandle) -> S, + S: Subscriber, + { + let callback_handle = CallbackHandle { + inner: Arc::downgrade(&self.inner), + }; + let subscriber = policy_constructor(callback_handle); + lock_inner!(mut guard = self.inner); + guard.subscribers.push(Box::new(subscriber)); + } + + /// Provide temporary read-only access to the underlying backend. + /// + /// This is mostly useful for debugging and testing. + pub fn inner_ref(&mut self) -> InnerBackendRef<'_, K, V> { + // NOTE: We deliberately use a mutable reference here to prevent users from using `` while we hold a lock to the underlying backend. 
+ + inner_ref::build(Arc::clone(&self.inner)) + } +} + +impl CacheBackend for PolicyBackend +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + type K = K; + type V = V; + + fn get(&mut self, k: &Self::K) -> Option { + lock_inner!(mut guard = self.inner); + perform_changes(&mut guard, vec![ChangeRequest::get(k.clone())]); + + // poll inner backend AFTER everything has settled + guard.inner.get(k) + } + + fn set(&mut self, k: Self::K, v: Self::V) { + lock_inner!(mut guard = self.inner); + perform_changes(&mut guard, vec![ChangeRequest::set(k, v)]); + } + + fn remove(&mut self, k: &Self::K) { + lock_inner!(mut guard = self.inner); + perform_changes(&mut guard, vec![ChangeRequest::remove(k.clone())]); + } + + fn is_empty(&self) -> bool { + lock_inner!(guard = self.inner); + guard.inner.is_empty() + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } +} + +/// Handle that allows a [`Subscriber`] to send [`ChangeRequest`]s back to the [`PolicyBackend`] +/// that owns that very [`Subscriber`]. +#[derive(Debug)] +pub struct CallbackHandle +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + inner: WeakSharedInner, +} + +impl CallbackHandle +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// Start a series of requests to the [`PolicyBackend`] that is referenced by this handle. + /// + /// This method returns AFTER the requests and all the follow-up changes requested by all + /// policies are played out. You should NOT hold a lock on your policies internal data + /// structures while calling this function if you plan to also [subscribe](Subscriber) to + /// changes because this would easily lead to deadlocks. 
+ pub fn execute_requests(&mut self, change_requests: Vec>) { + let Some(inner) = self.inner.upgrade() else { + // backend gone, can happen during shutdowns, try not to panic + return; + }; + + lock_inner!(mut guard = inner); + perform_changes(&mut guard, change_requests); + } +} + +#[derive(Debug)] +struct PolicyBackendInner +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// Underlying cache backend. + inner: Box + Send>, + + /// List of subscribers. + subscribers: Vec>>, + + /// Time provider. + time_provider: Arc, +} + +type WeakSharedInner = Weak>>>; +type StrongSharedInner = Arc>>>; + +/// Perform changes breadth first. +fn perform_changes( + inner: &mut PolicyBackendInner, + change_requests: Vec>, +) where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + let mut tasks = VecDeque::from(change_requests); + let now = inner.time_provider.now(); + + while let Some(change_request) = tasks.pop_front() { + let mut recorder = Recorder { + inner: inner.inner.as_mut(), + records: vec![], + }; + + change_request.eval(&mut recorder); + + for record in recorder.records { + for subscriber in &mut inner.subscribers { + let requests = match &record { + Record::Get { k } => subscriber.get(k, now), + Record::Set { k, v } => subscriber.set(k, v, now), + Record::Remove { k } => subscriber.remove(k, now), + }; + + tasks.extend(requests.into_iter()); + } + } + } +} + +/// Subscriber to change events. +pub trait Subscriber: Debug + Send + 'static { + /// Cache key. + type K: Clone + Eq + Hash + Ord + Debug + Send + 'static; + + /// Cached value. + type V: Clone + Debug + Send + 'static; + + /// Get value for given key if it exists. + /// + /// The current time `now` is provided as a parameter so that all policies and backends use a + /// unified timestamp rather than their own provider, which is more consistent and performant. 
+ fn get(&mut self, _k: &Self::K, _now: Time) -> Vec> { + // do nothing by default + vec![] + } + + /// Set value for given key. + /// + /// It is OK to set and override a key that already exists. + /// + /// The current time `now` is provided as a parameter so that all policies and backends use a + /// unified timestamp rather than their own provider, which is more consistent and performant. + fn set( + &mut self, + _k: &Self::K, + _v: &Self::V, + _now: Time, + ) -> Vec> { + // do nothing by default + vec![] + } + + /// Remove value for given key. + /// + /// It is OK to remove a key even when it does not exist. + /// + /// The current time `now` is provided as a parameter so that all policies and backends use a + /// unified timestamp rather than their own provider, which is more consistent and performant. + fn remove( + &mut self, + _k: &Self::K, + _now: Time, + ) -> Vec> { + // do nothing by default + vec![] + } +} + +/// A change request to a backend. +pub struct ChangeRequest<'a, K, V> +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + fun: ChangeRequestFn<'a, K, V>, +} + +impl<'a, K, V> Debug for ChangeRequest<'a, K, V> +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CacheRequest").finish_non_exhaustive() + } +} + +impl<'a, K, V> ChangeRequest<'a, K, V> +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// Custom way of constructing a change request. + /// + /// This is considered a rather low-level function and you should prefer the higher-level + /// constructs like [`get`](Self::get), [`set`](Self::set), and [`remove`](Self::remove). + /// + /// Takes a "callback backend" and can freely act on it. 
The underlying backend of + /// [`PolicyBackend`] is guaranteed to be locked during a single request, so "get + modify" + /// patterns work out of the box without the need to fear interleaving modifications. + pub fn from_fn(f: F) -> Self + where + F: for<'b, 'c> FnOnce(&'c mut Recorder<'b, K, V>) + 'a, + { + Self { fun: Box::new(f) } + } + + /// [`GET`](CacheBackend::get) + pub fn get(k: K) -> Self { + Self::from_fn(move |backend| { + backend.get(&k); + }) + } + + /// [`SET`](CacheBackend::set) + pub fn set(k: K, v: V) -> Self { + Self::from_fn(move |backend| { + backend.set(k, v); + }) + } + + /// [`REMOVE`](CacheBackend::remove). + pub fn remove(k: K) -> Self { + Self::from_fn(move |backend| { + backend.remove(&k); + }) + } + + /// Ensure that backend is empty and panic otherwise. + /// + /// This is mostly useful during initialization. + pub fn ensure_empty() -> Self { + Self::from_fn(|backend| { + assert!(backend.is_empty(), "inner backend is not empty"); + }) + } + + /// Execute this change request. + pub fn eval(self, backend: &mut Recorder<'_, K, V>) { + (self.fun)(backend); + } +} + +/// Function captured within [`ChangeRequest`]. +type ChangeRequestFn<'a, K, V> = Box FnOnce(&'c mut Recorder<'b, K, V>) + 'a>; + +/// Records of interactions with the callback [`CacheBackend`]. +#[derive(Debug, PartialEq)] +enum Record { + /// [`GET`](CacheBackend::get) + Get { + /// Key. + k: K, + }, + + /// [`SET`](CacheBackend::set) + Set { + /// Key. + k: K, + + /// Value. + v: V, + }, + + /// [`REMOVE`](CacheBackend::remove). + Remove { + /// Key. + k: K, + }, +} + +/// Specialized [`CacheBackend`] that forwards changes and requests to the underlying backend of +/// [`PolicyBackend`] but also records all changes into [`Record`]s. 
+#[derive(Debug)] +pub struct Recorder<'a, K, V> +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + inner: &'a mut (dyn CacheBackend + Send), + records: Vec>, +} + +impl<'a, K, V> Recorder<'a, K, V> +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// Perform a [`GET`](CacheBackend::get) request that is NOT seen by other policies. + /// + /// This is helpful if you just want to check the underlying data of a key without treating it + /// as "used". + /// + /// Note that this functionality only exists for [`GET`](CacheBackend::get) requests, not for + /// modifying requests like [`SET`](CacheBackend::set) or [`REMOVE`](CacheBackend::remove) + /// since they always require policies to be in-sync. + pub fn get_untracked(&mut self, k: &K) -> Option { + self.inner.get(k) + } +} + +impl<'a, K, V> CacheBackend for Recorder<'a, K, V> +where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + type K = K; + type V = V; + + fn get(&mut self, k: &Self::K) -> Option { + self.records.push(Record::Get { k: k.clone() }); + self.inner.get(k) + } + + fn set(&mut self, k: Self::K, v: Self::V) { + self.records.push(Record::Set { + k: k.clone(), + v: v.clone(), + }); + self.inner.set(k, v); + } + + fn remove(&mut self, k: &Self::K) { + self.records.push(Record::Remove { k: k.clone() }); + self.inner.remove(k); + } + + fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + fn as_any(&self) -> &dyn std::any::Any { + panic!("don't any-cast the recorder please") + } +} + +/// Helper module that wraps the implementation of [`InnerBackendRef`]. +/// +/// This is required because [`ouroboros`] generates a bunch of code that we do not want to leak all over the place. 
mod inner_ref {
    #![allow(non_snake_case, clippy::future_not_send)]

    use super::*;
    use ouroboros::self_referencing;

    /// Read-only ref to the inner backend of [`PolicyBackend`] for debugging.
    #[self_referencing]
    pub struct InnerBackendRef<'a, K, V>
    where
        K: Clone + Eq + Hash + Ord + Debug + Send + 'static,
        V: Clone + Debug + Send + 'static,
    {
        // NOTE(review): the guard's type parameters were lost in transit; this assumes the
        // parking_lot defaults (`RawMutex`, `RawThreadId`) -- confirm against the file's imports.
        l1: ArcReentrantMutexGuard<RawMutex, RawThreadId, RefCell<PolicyBackendInner<K, V>>>,
        #[borrows(l1)]
        #[covariant]
        l2: std::cell::RefMut<'this, PolicyBackendInner<K, V>>,
        // ties the ref's lifetime to the `&mut self` borrow of `PolicyBackend::inner_ref`
        _phantom: PhantomData<&'a mut ()>,
    }

    impl<'a, K, V> Deref for InnerBackendRef<'a, K, V>
    where
        K: Clone + Eq + Hash + Ord + Debug + Send + 'static,
        V: Clone + Debug + Send + 'static,
    {
        type Target = dyn CacheBackend<K = K, V = V>;

        fn deref(&self) -> &Self::Target {
            self.borrow_l2().inner.as_ref()
        }
    }

    /// Lock the shared state and build a self-referencing read guard over it.
    pub(super) fn build<'a, K, V>(inner: StrongSharedInner<K, V>) -> InnerBackendRef<'a, K, V>
    where
        K: Clone + Eq + Hash + Ord + Debug + Send + 'static,
        V: Clone + Debug + Send + 'static,
    {
        let inner = inner.lock_arc();
        InnerBackendRefBuilder {
            l1: inner,
            l2_builder: |l1| l1.borrow_mut(),
            _phantom: PhantomData,
        }
        .build()
    }
}

pub use inner_ref::InnerBackendRef;

#[cfg(test)]
mod tests {
    use std::{collections::HashMap, sync::Barrier, thread::JoinHandle};

    use iox_time::MockProvider;

    use super::*;

    #[allow(dead_code)]
    const fn assert_send<T: Send>() {}
    // NOTE(review): the concrete type argument was lost in transit; any `Send` instantiation
    // preserves the intent of this compile-time check -- confirm against upstream.
    const _: () = assert_send::<PolicyBackend<String, usize>>();

    #[test]
    #[should_panic(expected = "inner backend is not empty")]
    fn test_panic_inner_not_empty() {
        let time_provider = Arc::new(MockProvider::new(Time::MIN));
        PolicyBackend::new(
            Box::new(HashMap::from([(String::from("foo"), 1usize)])),
            time_provider,
        );
    }

    #[test]
    fn test_generic() {
        crate::backend::test_util::test_generic(|| {
            let time_provider = Arc::new(MockProvider::new(Time::MIN));
            PolicyBackend::hashmap_backed(time_provider)
        })
    }

    #[test]
    #[should_panic(expected = "test steps left")]
    fn test_meta_panic_steps_left() {
        let
time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::ChangeRequests(vec![]), + }])); + } + + #[test] + #[should_panic(expected = "step left for get operation")] + fn test_meta_panic_requires_condition_get() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![])); + + backend.get(&String::from("a")); + } + + #[test] + #[should_panic(expected = "step left for set operation")] + fn test_meta_panic_requires_condition_set() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![])); + + backend.set(String::from("a"), 2); + } + + #[test] + #[should_panic(expected = "step left for remove operation")] + fn test_meta_panic_requires_condition_remove() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![])); + + backend.remove(&String::from("a")); + } + + #[test] + #[should_panic(expected = "Condition mismatch")] + fn test_meta_panic_checks_condition_get() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }])); + + backend.get(&String::from("b")); + } + + #[test] + #[should_panic(expected = "Condition mismatch")] + fn test_meta_panic_checks_condition_set() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = 
PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::ChangeRequests(vec![]), + }])); + + backend.set(String::from("a"), 2); + } + + #[test] + #[should_panic(expected = "Condition mismatch")] + fn test_meta_panic_checks_condition_remove() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![TestStep { + condition: TestBackendInteraction::Remove { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }])); + + backend.remove(&String::from("b")); + } + + #[test] + fn test_basic_propagation() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("b"), + v: 2, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Remove { + k: String::from("b"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("b"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.set(String::from("a"), 1); + backend.set(String::from("b"), 2); + backend.remove(&String::from("b")); + + assert_eq!(backend.get(&String::from("a")), Some(1)); + assert_eq!(backend.get(&String::from("b")), None); + } + + #[test] + #[should_panic(expected = "illegal recursive access")] + fn 
test_panic_recursion_detection_get() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![TestStep { + condition: TestBackendInteraction::Remove { + k: String::from("a"), + }, + action: TestAction::CallBackendDirectly(TestBackendInteraction::Get { + k: String::from("b"), + }), + }])); + + backend.remove(&String::from("a")); + } + + #[test] + #[should_panic(expected = "illegal recursive access")] + fn test_panic_recursion_detection_set() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![TestStep { + condition: TestBackendInteraction::Remove { + k: String::from("a"), + }, + action: TestAction::CallBackendDirectly(TestBackendInteraction::Set { + k: String::from("b"), + v: 1, + }), + }])); + + backend.remove(&String::from("a")); + } + + #[test] + #[should_panic(expected = "illegal recursive access")] + fn test_panic_recursion_detection_remove() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![TestStep { + condition: TestBackendInteraction::Remove { + k: String::from("a"), + }, + action: TestAction::CallBackendDirectly(TestBackendInteraction::Remove { + k: String::from("b"), + }), + }])); + + backend.remove(&String::from("a")); + } + + #[test] + fn test_get_untracked() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::from_fn( + |backend| { + assert_eq!(backend.get_untracked(&String::from("a")), Some(1)); + }, + )]), + }, + 
// NO `GET` interaction triggered here! + ])); + + backend.set(String::from("a"), 1); + } + + #[test] + fn test_basic_get_set() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("a"), + 1, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + assert_eq!(backend.get(&String::from("a")), Some(1)); + } + + #[test] + fn test_basic_get_get() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::get(String::from( + "a", + ))]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + assert_eq!(backend.get(&String::from("a")), None); + } + + #[test] + fn test_basic_set_set_get_get() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("b"), + 2, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("b"), + v: 2, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: 
TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("b"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.set(String::from("a"), 1); + + assert_eq!(backend.get(&String::from("a")), Some(1)); + assert_eq!(backend.get(&String::from("b")), Some(2)); + } + + #[test] + fn test_basic_set_remove_get() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::remove( + String::from("a"), + )]), + }, + TestStep { + condition: TestBackendInteraction::Remove { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.set(String::from("a"), 1); + + assert_eq!(backend.get(&String::from("a")), None); + } + + #[test] + fn test_basic_remove_set_get_get() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Remove { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("b"), + 1, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("b"), + v: 1, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("b"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + 
backend.remove(&String::from("a")); + + assert_eq!(backend.get(&String::from("a")), None); + assert_eq!(backend.get(&String::from("b")), Some(1)); + } + + #[test] + fn test_basic_remove_remove_get_get() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Remove { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::remove( + String::from("b"), + )]), + }, + TestStep { + condition: TestBackendInteraction::Remove { + k: String::from("b"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("b"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.remove(&String::from("a")); + + assert_eq!(backend.get(&String::from("a")), None); + assert_eq!(backend.get(&String::from("b")), None); + } + + #[test] + fn test_ordering_within_requests_vector() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 11, + }, + action: TestAction::ChangeRequests(vec![ + SendableChangeRequest::set(String::from("a"), 12), + SendableChangeRequest::set(String::from("a"), 13), + ]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 12, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 13, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), 
+ }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.set(String::from("a"), 11); + + assert_eq!(backend.get(&String::from("a")), Some(13)); + } + + #[test] + fn test_ordering_across_policies() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 11, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("a"), + 12, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 12, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 13, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 11, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("a"), + 13, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 12, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 13, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.set(String::from("a"), 11); + + assert_eq!(backend.get(&String::from("a")), Some(13)); + } + + #[test] + fn test_ping_pong() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + 
backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 11, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("a"), + 12, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 12, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 13, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("a"), + 14, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 14, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 11, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("a"), + 13, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 12, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 13, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 14, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.set(String::from("a"), 11); + + assert_eq!(backend.get(&String::from("a")), Some(14)); + } + + #[test] + #[should_panic(expected = "this is a test")] + fn test_meta_multithread_panics_are_propagated() { + let barrier_pre = Arc::new(Barrier::new(2)); + let barrier_post = 
Arc::new(Barrier::new(1)); + + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::CallBackendDelayed( + Arc::clone(&barrier_pre), + TestBackendInteraction::Panic, + Arc::clone(&barrier_post), + ), + }])); + + backend.set(String::from("a"), 1); + barrier_pre.wait(); + + // panic on drop + } + + /// Checks that a policy background task can access the "callback backend" without triggering + /// the "illegal recursion" detection. + #[test] + fn test_multithread() { + let barrier_pre = Arc::new(Barrier::new(2)); + let barrier_post = Arc::new(Barrier::new(2)); + + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::CallBackendDelayed( + Arc::clone(&barrier_pre), + TestBackendInteraction::Set { + k: String::from("a"), + v: 4, + }, + Arc::clone(&barrier_post), + ), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 2, + }, + action: TestAction::BlockAndChangeRequest( + Arc::clone(&barrier_pre), + vec![SendableChangeRequest::set(String::from("a"), 3)], + ), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 3, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 4, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: 
TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.set(String::from("a"), 1); + assert_eq!(backend.get(&String::from("a")), Some(1)); + + backend.set(String::from("a"), 2); + + barrier_post.wait(); + assert_eq!(backend.get(&String::from("a")), Some(4)); + } + + #[test] + fn test_get_from_policies_are_propagated() { + let barrier_pre = Arc::new(Barrier::new(2)); + let barrier_post = Arc::new(Barrier::new(2)); + + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 1, + }, + action: TestAction::CallBackendDelayed( + Arc::clone(&barrier_pre), + TestBackendInteraction::Get { + k: String::from("a"), + }, + Arc::clone(&barrier_post), + ), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.set(String::from("a"), 1); + barrier_pre.wait(); + barrier_post.wait(); + } + + /// Checks that dropping [`PolicyBackend`] drop the policies as well as the inner backend. + #[test] + fn test_drop() { + let marker_backend = Arc::new(()); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::new( + Box::new(DropTester(Arc::clone(&marker_backend), ())), + time_provider, + ); + + let marker_policy = Arc::new(()); + backend.add_policy(|callback| DropTester(Arc::clone(&marker_policy), callback)); + + assert_eq!(Arc::strong_count(&marker_backend), 2); + assert_eq!(Arc::strong_count(&marker_policy), 2); + + drop(backend); + + assert_eq!(Arc::strong_count(&marker_backend), 1); + assert_eq!(Arc::strong_count(&marker_policy), 1); + } + + /// We have to ways of handling "compound" [`ChangeRequest`]s, i.e. requests that perform + /// multiple operations: + /// + /// 1. 
We could loop over the operations and inner-loop over the policies to collect reactions + /// 2. We could loop over all the policies and present each polices all operations in one go + /// + /// We've decided to chose option 1. This test ensures that by setting up a compound request + /// (reacting to `set("a", 11)`) with a compound of two operations (`set("a", 12)`, `set("a", + /// 13)`) which we call `C1` and `C2` (for "compound 1 and 2"). The two policies react to + /// these two compound operations as follows: + /// + /// | | Policy 1 | Policy 2 | + /// | -- | -------------- | -------------- | + /// | C1 | `set("a", 14)` | `set("a", 15)` | + /// | C2 | `set("a", 16)` | -- | + /// + /// For option (1) the outcome will be `"a" -> 16`, for option (2) the outcome would be `"a" -> + /// 15`. + #[test] + fn test_ordering_within_compound_requests() { + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 11, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::from_fn( + |backend| { + backend.set(String::from("a"), 12); + backend.set(String::from("a"), 13); + }, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 12, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("a"), + 14, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 13, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("a"), + 16, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 14, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 15, + }, + action: TestAction::ChangeRequests(vec![]), 
+ }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 16, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + backend.add_policy(create_test_policy(vec![ + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 11, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 12, + }, + action: TestAction::ChangeRequests(vec![SendableChangeRequest::set( + String::from("a"), + 15, + )]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 13, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 14, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 15, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Set { + k: String::from("a"), + v: 16, + }, + action: TestAction::ChangeRequests(vec![]), + }, + TestStep { + condition: TestBackendInteraction::Get { + k: String::from("a"), + }, + action: TestAction::ChangeRequests(vec![]), + }, + ])); + + backend.set(String::from("a"), 11); + + assert_eq!(backend.get(&String::from("a")), Some(16)); + } + + #[derive(Debug)] + struct DropTester(Arc<()>, T) + where + T: Debug + Send + 'static; + + impl CacheBackend for DropTester + where + T: Debug + Send + 'static, + { + type K = (); + type V = (); + + fn get(&mut self, _k: &Self::K) -> Option { + unreachable!() + } + + fn set(&mut self, _k: Self::K, _v: Self::V) { + unreachable!() + } + + fn remove(&mut self, _k: &Self::K) { + unreachable!() + } + + fn is_empty(&self) -> bool { + true + } + + fn as_any(&self) -> &dyn std::any::Any 
{ + unreachable!() + } + } + + impl Subscriber for DropTester + where + T: Debug + Send + 'static, + { + type K = (); + type V = (); + } + + fn create_test_policy( + steps: Vec, + ) -> impl FnOnce(CallbackHandle) -> TestSubscriber { + |handle| TestSubscriber { + background_task: TestSubscriberBackgroundTask::NotStarted(handle), + steps: VecDeque::from(steps), + } + } + + #[derive(Debug, PartialEq)] + enum TestBackendInteraction { + Get { k: String }, + + Set { k: String, v: usize }, + + Remove { k: String }, + + Panic, + } + + impl TestBackendInteraction { + fn perform(self, handle: &mut CallbackHandle) { + match self { + Self::Get { k } => { + handle.execute_requests(vec![ChangeRequest::get(k)]); + } + Self::Set { k, v } => handle.execute_requests(vec![ChangeRequest::set(k, v)]), + Self::Remove { k } => handle.execute_requests(vec![ChangeRequest::remove(k)]), + Self::Panic => panic!("this is a test"), + } + } + } + + #[derive(Debug)] + enum TestAction { + /// Perform an illegal direct, recursive call to the backend. + CallBackendDirectly(TestBackendInteraction), + + /// Return change requests + ChangeRequests(Vec), + + /// Use callback backend but wait for a barrier in a background thread. + /// + /// This will return immediately. + CallBackendDelayed(Arc, TestBackendInteraction, Arc), + + /// Block on barrier and return afterwards. 
+ BlockAndChangeRequest(Arc, Vec), + } + + impl TestAction { + fn perform( + self, + background_task: &mut TestSubscriberBackgroundTask, + ) -> Vec> { + match self { + Self::CallBackendDirectly(interaction) => { + let handle = match background_task { + TestSubscriberBackgroundTask::NotStarted(handle) => handle, + TestSubscriberBackgroundTask::Started(_) => { + panic!("background task already started") + } + TestSubscriberBackgroundTask::Invalid => panic!("Invalid state"), + }; + + interaction.perform(handle); + unreachable!("illegal call should have failed") + } + Self::ChangeRequests(change_requests) => { + change_requests.into_iter().map(|r| r.into()).collect() + } + Self::CallBackendDelayed(barrier_pre, interaction, barrier_post) => { + let mut tmp = TestSubscriberBackgroundTask::Invalid; + std::mem::swap(&mut tmp, background_task); + let mut handle = match tmp { + TestSubscriberBackgroundTask::NotStarted(handle) => handle, + TestSubscriberBackgroundTask::Started(_) => { + panic!("background task already started") + } + TestSubscriberBackgroundTask::Invalid => panic!("Invalid state"), + }; + + let join_handle = std::thread::spawn(move || { + barrier_pre.wait(); + interaction.perform(&mut handle); + barrier_post.wait(); + }); + *background_task = TestSubscriberBackgroundTask::Started(join_handle); + + vec![] + } + Self::BlockAndChangeRequest(barrier, change_requests) => { + barrier.wait(); + change_requests.into_iter().map(|r| r.into()).collect() + } + } + } + } + + #[derive(Debug)] + struct TestStep { + condition: TestBackendInteraction, + action: TestAction, + } + + #[derive(Debug)] + enum TestSubscriberBackgroundTask { + NotStarted(CallbackHandle), + Started(JoinHandle<()>), + + /// Temporary variant for swapping. 
+ Invalid, + } + + #[derive(Debug)] + struct TestSubscriber { + background_task: TestSubscriberBackgroundTask, + steps: VecDeque, + } + + impl Drop for TestSubscriber { + fn drop(&mut self) { + // prevent SIGABRT due to double-panic + if !std::thread::panicking() { + assert!(self.steps.is_empty(), "test steps left"); + let mut tmp = TestSubscriberBackgroundTask::Invalid; + std::mem::swap(&mut tmp, &mut self.background_task); + + match tmp { + TestSubscriberBackgroundTask::NotStarted(_) => { + // nothing to check + } + TestSubscriberBackgroundTask::Started(handle) => { + // propagate panics + if let Err(e) = handle.join() { + if let Some(err) = e.downcast_ref::<&str>() { + panic!("Error in background task: {err}") + } else if let Some(err) = e.downcast_ref::() { + panic!("Error in background task: {err}") + } else { + panic!("Error in background task: ") + } + } + } + TestSubscriberBackgroundTask::Invalid => { + // that's OK during drop + } + } + } + } + } + + impl Subscriber for TestSubscriber { + type K = String; + type V = usize; + + fn get( + &mut self, + k: &Self::K, + _now: Time, + ) -> Vec> { + let step = self.steps.pop_front().expect("step left for get operation"); + + let expected_condition = TestBackendInteraction::Get { k: k.clone() }; + assert_eq!( + step.condition, expected_condition, + "Condition mismatch\n\nActual:\n{:#?}\n\nExpected:\n{:#?}", + step.condition, expected_condition, + ); + + step.action.perform(&mut self.background_task) + } + + fn set( + &mut self, + k: &Self::K, + v: &Self::V, + _now: Time, + ) -> Vec> { + let step = self.steps.pop_front().expect("step left for set operation"); + + let expected_condition = TestBackendInteraction::Set { + k: k.clone(), + v: *v, + }; + assert_eq!( + step.condition, expected_condition, + "Condition mismatch\n\nActual:\n{:#?}\n\nExpected:\n{:#?}", + step.condition, expected_condition, + ); + + step.action.perform(&mut self.background_task) + } + + fn remove( + &mut self, + k: &Self::K, + _now: Time, + ) 
-> Vec> { + let step = self + .steps + .pop_front() + .expect("step left for remove operation"); + + let expected_condition = TestBackendInteraction::Remove { k: k.clone() }; + assert_eq!( + step.condition, expected_condition, + "Condition mismatch\n\nActual:\n{:#?}\n\nExpected:\n{:#?}", + step.condition, expected_condition, + ); + + step.action.perform(&mut self.background_task) + } + } + + /// Same as [`ChangeRequestFn`] but implements `Send`. + type SendableChangeRequestFn = + Box FnOnce(&'b mut Recorder<'a, String, usize>) + Send + 'static>; + + /// Same as [`ChangeRequest`] but implements `Send`. + struct SendableChangeRequest { + fun: SendableChangeRequestFn, + } + + impl Debug for SendableChangeRequest { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SendableCacheRequest") + .finish_non_exhaustive() + } + } + + impl SendableChangeRequest { + fn from_fn(f: F) -> Self + where + F: for<'b, 'c> FnOnce(&'c mut Recorder<'b, String, usize>) + Send + 'static, + { + Self { fun: Box::new(f) } + } + + fn get(k: String) -> Self { + Self::from_fn(move |backend| { + backend.get(&k); + }) + } + + fn set(k: String, v: usize) -> Self { + Self::from_fn(move |backend| { + backend.set(k, v); + }) + } + + fn remove(k: String) -> Self { + Self::from_fn(move |backend| { + backend.remove(&k); + }) + } + } + + impl From for ChangeRequest<'static, String, usize> { + fn from(request: SendableChangeRequest) -> Self { + Self::from_fn(request.fun) + } + } +} diff --git a/cache_system/src/backend/policy/refresh.rs b/cache_system/src/backend/policy/refresh.rs new file mode 100644 index 0000000..6682352 --- /dev/null +++ b/cache_system/src/backend/policy/refresh.rs @@ -0,0 +1,1028 @@ +//! Refresh handling. 
+use std::{ + fmt::Debug, + hash::Hash, + marker::PhantomData, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, +}; + +use backoff::{Backoff, BackoffConfig}; +use futures::{future::BoxFuture, stream::FuturesUnordered, FutureExt, StreamExt}; +use iox_time::{Time, TimeProvider}; +use metric::U64Counter; +use parking_lot::Mutex; +use rand::rngs::mock::StepRng; +use tokio::{runtime::Handle, sync::Notify, task::JoinHandle}; +use tokio_util::sync::CancellationToken; + +use crate::{addressable_heap::AddressableHeap, loader::Loader}; + +use super::{CacheBackend, CallbackHandle, ChangeRequest, Subscriber}; + +/// Interface to provide refresh duration for a key-value pair. +pub trait RefreshDurationProvider: std::fmt::Debug + Send + Sync + 'static { + /// Cache key. + type K; + + /// Cached value. + type V; + + /// When should the given key-value pair be refreshed? + /// + /// Return `None` for "never". + /// + /// The function is only called once for a newly cached key-value pair. This means: + /// - There is no need in remembering the time of a given pair (e.g. you can safely always return a constant). + /// - You cannot change the timings after the data was cached. + /// + /// Refresh is set to take place AT OR AFTER the provided duration. + fn refresh_in(&self, k: &Self::K, v: &Self::V) -> Option; +} + +/// [`RefreshDurationProvider`] that never expires. 
+#[derive(Default)] +pub struct NeverRefreshProvider +where + K: 'static, + V: 'static, +{ + // phantom data that is Send and Sync, see https://stackoverflow.com/a/50201389 + _k: PhantomData K>, + _v: PhantomData V>, +} + +impl std::fmt::Debug for NeverRefreshProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("NeverRefreshProvider") + .finish_non_exhaustive() + } +} + +impl RefreshDurationProvider for NeverRefreshProvider { + type K = K; + type V = V; + + fn refresh_in(&self, _k: &Self::K, _v: &Self::V) -> Option { + None + } +} + +/// [`RefreshDurationProvider`] that returns different values for `None`/`Some(...)` values. +pub struct OptionalValueRefreshDurationProvider +where + K: 'static, + V: 'static, +{ + // phantom data that is Send and Sync, see https://stackoverflow.com/a/50201389 + _k: PhantomData K>, + _v: PhantomData V>, + + backoff_cfg_none: Option, + backoff_cfg_some: Option, +} + +impl OptionalValueRefreshDurationProvider +where + K: 'static, + V: 'static, +{ + /// Create new provider with the given refresh duration for `None` and `Some(...)`. + pub fn new( + backoff_cfg_none: Option, + backoff_cfg_some: Option, + ) -> Self { + Self { + _k: PhantomData, + _v: PhantomData, + backoff_cfg_none, + backoff_cfg_some, + } + } +} + +impl std::fmt::Debug for OptionalValueRefreshDurationProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OptionalValueRefreshDurationProvider") + .field("t_none", &self.backoff_cfg_none) + .field("t_some", &self.backoff_cfg_some) + .finish_non_exhaustive() + } +} + +impl RefreshDurationProvider for OptionalValueRefreshDurationProvider { + type K = K; + type V = Option; + + fn refresh_in(&self, _k: &Self::K, v: &Self::V) -> Option { + match v { + None => self.backoff_cfg_none.clone(), + Some(_) => self.backoff_cfg_some.clone(), + } + } +} + +/// Tag for keys (incl. 
their backoff state and their running background tasks) to reason about lock gaps. +type Tag = u64; + +/// Cache policy that implements refreshing. +#[derive(Debug)] +pub struct RefreshPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + refresh_duration_provider: Arc>, + background_worker: JoinHandle<()>, + timings: Arc>>, + timings_changed: Arc, + tag_counter: AtomicU64, + rng_overwrite: Option, +} + +impl RefreshPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// Create new refresh policy. + #[allow(clippy::new_ret_no_self)] + pub fn new( + time_provider: Arc, + refresh_duration_provider: Arc>, + loader: Arc>, + name: &'static str, + metric_registry: &metric::Registry, + handle: &Handle, + ) -> impl FnOnce(CallbackHandle) -> Self { + let idle_notify = Arc::new(Notify::new()); + Self::new_inner( + time_provider, + refresh_duration_provider, + loader, + name, + metric_registry, + idle_notify, + handle, + None, + ) + } + + /// Create new refresh policy but allows to specify some internals for testing. + /// + /// These internals are: + /// + /// - `idle_notify`: a [`Notify`] that will be triggered when the background worker is idle. + /// - `rng_overwrite`: a static RNG that will be used for the [`backoff`]-based refresh timers instead of a true + /// thread RNG. 
+ #[allow(clippy::new_ret_no_self, clippy::too_many_arguments)] + pub(crate) fn new_inner( + time_provider: Arc, + refresh_duration_provider: Arc>, + loader: Arc>, + name: &'static str, + metric_registry: &metric::Registry, + idle_notify: Arc, + handle: &Handle, + rng_overwrite: Option, + ) -> impl FnOnce(CallbackHandle) -> Self { + let metric_refreshed = metric_registry + .register_metric::("cache_refresh", "Number of cache refresh operations.") + .recorder(&[("name", name)]); + + // clone handle for callback + let handle = handle.clone(); + + move |mut callback_handle| { + callback_handle.execute_requests(vec![ChangeRequest::ensure_empty()]); + + let timings: Arc>> = + Default::default(); + let timings_captured = Arc::clone(&timings); + let timings_changed = Arc::new(Notify::new()); + let timings_changed_captured = Arc::clone(&timings_changed); + let callback_handle = Arc::new(Mutex::new(callback_handle)); + let rng_overwrite_captured = rng_overwrite.clone(); + + let background_worker = handle.spawn(async move { + let mut refresh_tasks = FuturesUnordered::>>::new(); + + // We MUST NOT poll the empty task set because this would finish immediately. This will hot-loop + // the loop. Even worse, since `FuturesUnodered` is not hooked up into tokio's (somewhat bizarre) + // task preemtion system, tokio will poll this method here forever, essentially blocking this + // thread. 
+ refresh_tasks.push(Box::pin(futures::future::pending())); + + // flag that remembers if we can notify idle observers again + let mut can_notify_idle = true; + + loop { + // future that waits for the next refresh task to start + let fut_start_next_task: BoxFuture<'static, ()> = { + let timings = timings_captured.lock(); + match timings.peek() { + None => Box::pin(futures::future::pending()), + Some((_k, _state, t_next)) => match t_next { + TimeOrNever::Never => Box::pin(futures::future::pending()), + TimeOrNever::Time(t) => Box::pin(time_provider.sleep_until(*t)), + } + } + }; + + // future that "guards" our idle notification to prevent hot loops (essentially blocking the entire + // tokio thread forever) + let fut_idle_notify_guard: BoxFuture<'static, ()> = if can_notify_idle { + Box::pin(futures::future::ready(())) + } else { + Box::pin(futures::future::pending()) + }; + + tokio::select! { + biased; + maybe_k_and_tag = refresh_tasks.next() => { + // a refresh tasks finished + + // see if this refresh task was NOT finished + if let Some((k, tag)) = maybe_k_and_tag.flatten() { + let mut timings = timings_captured.lock(); + if let Some((mut state, t_next)) = timings.remove(&k) { + if state.tag == tag { + state.running_refresh = None; + let (state, t_next) = state.next(time_provider.now(), &rng_overwrite_captured); + timings.insert(k, state, t_next); + } else { + // wrong one (lock gap) + timings.insert(k, state, t_next); + } + } + } + + can_notify_idle = true; + } + _ = fut_start_next_task => { + // a new refresh task shall start + let mut timings = timings_captured.lock(); + + // careful with inspection of timings since there was a lock-gap, the data might have changed + if let Some((k, mut state, t_next)) = timings.pop() { + if t_next <= TimeOrNever::Time(time_provider.now()) { + assert!(state.running_refresh.is_none()); + + let (fut, ctoken) = Self::refresh(Arc::clone(&loader), Arc::clone(&callback_handle), k.clone(), state.tag, metric_refreshed.clone()); + 
state.running_refresh = Some(ctoken); + refresh_tasks.push(fut); + + timings.insert(k, state, TimeOrNever::Never); + } else { + // the entry in question is gone and we got the wrong one, put it back + timings.insert(k, state, t_next); + } + } + + can_notify_idle = true; + } + _ = timings_changed_captured.notified() => { + // timings updated + + // do NOT count this as "can not notify IDLE" because nothing really happened yet + } + _ = fut_idle_notify_guard => { + // no other jobs to do (this select is biased!), we inform the external test observer + idle_notify.notify_one(); + can_notify_idle = false; + } + } + } + }); + + Self { + refresh_duration_provider, + background_worker, + timings, + timings_changed, + tag_counter: AtomicU64::new(0), + rng_overwrite, + } + } + } + + /// Start refresh task for given key and return cancelation token for the task. + /// + /// You shall store the given token in [`RefreshState`]. + #[must_use] + fn refresh( + loader: Arc>, + callback_handle: Arc>>, + k: K, + tag: Tag, + metric_refreshed: U64Counter, + ) -> (BoxFuture<'static, Option<(K, Tag)>>, CancellationToken) { + let cancelled = CancellationToken::default(); + + let cancelled_captured = cancelled.clone(); + let fut = async move { + // some `let`-dance so that rustc does not complain that `&K` is not `Send` + let k_for_loader = k.clone(); + let v = loader.load(k_for_loader, ()).await; + + let mut callback_handle = callback_handle.lock(); + callback_handle.execute_requests(vec![ChangeRequest::from_fn(|backend| { + // Here we have the PolicyBackend implicit lock. There is no way our Subscriber can be + // active here, but we need to check if we have been canceled one last time. 
+ if cancelled_captured.is_cancelled() { + return; + } + + backend.set(k.clone(), v); + })]); + + // update metric AFTER change request + metric_refreshed.inc(1); + + // there is NO need to update our own `timings` after this refresh because this very Subscriber + // will also get a `set` notification and update its timing table accordingly + (k, tag) + }; + + let cancelled_captured = cancelled.clone(); + let fut = async move { + tokio::select! { + _ = cancelled_captured.cancelled() => None, + k = fut => Some(k), + } + } + .boxed(); + + (fut, cancelled) + } +} + +impl Drop for RefreshPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + fn drop(&mut self) { + self.background_worker.abort(); + } +} + +impl Subscriber for RefreshPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + type K = K; + type V = V; + + fn get(&mut self, k: &Self::K, now: Time) -> Vec> { + let mut timings = self.timings.lock(); + + // Does this entry exists? 
+ if let Some((mut state, t_next)) = timings.remove(k) { + // reset backoff + state.next = None; + + if state.running_refresh.is_some() { + // there is a refresh operation running, so just reset the backoff and put this back + assert_eq!(t_next, TimeOrNever::Never); + timings.insert(k.clone(), state, TimeOrNever::Never); + } else { + // refresh operation currently NOT running => schedule one + let (state, t_next) = state.next(now, &self.rng_overwrite); + timings.insert(k.clone(), state, t_next); + self.timings_changed.notify_one(); + } + } + + vec![] + } + + fn set( + &mut self, + k: &Self::K, + v: &Self::V, + now: Time, + ) -> Vec> { + let backoff_cfg = self.refresh_duration_provider.refresh_in(k, v); + + let mut timings = self.timings.lock(); + + // ignore any entries that don't require any work + if let Some(backoff_cfg) = backoff_cfg { + if let Some((mut state, time)) = timings.remove(k) { + // we know this key already + state.next = match state.next.take() { + Some(mut next) => { + next.fade_to(&backoff_cfg); + Some(next) + } + None => None, + }; + state.backoff_cfg = backoff_cfg; + + timings.insert(k.clone(), state, time); + self.timings_changed.notify_one(); + } else { + // new key + let state = + RefreshState::new(backoff_cfg, self.tag_counter.fetch_add(1, Ordering::SeqCst)); + let (state, time) = state.next(now, &self.rng_overwrite); + + timings.insert(k.clone(), state, time); + self.timings_changed.notify_one(); + } + } else { + // need to remove potentially existing entry that had some refresh set + timings.remove(k); + + // the removal drops the RefreshState which triggers a cancelation for any potentially running + // refresh operation + } + + vec![] + } + + fn remove(&mut self, k: &Self::K, _now: Time) -> Vec> { + let mut timings = self.timings.lock(); + timings.remove(k); + + // the removal automatically triggered a cancelation for any potentially running refresh operation + + vec![] + } +} + +/// Current state of an entry managed by the refresh 
policy. +#[derive(Debug)] +struct RefreshState { + /// When to refresh or expire. + backoff_cfg: BackoffConfig, + + /// Current backoff state + next: Option, + + /// Tag that links the background task to this very entry + tag: Tag, + + /// Cancellation token for a potentially running refresh operation. + /// + /// This token will be triggered on [`drop`](Drop::drop). + running_refresh: Option, +} + +impl RefreshState { + fn new(backoff_cfg: BackoffConfig, tag: Tag) -> Self { + Self { + backoff_cfg, + next: None, + tag, + running_refresh: None, + } + } + + fn next(mut self, now: Time, rng_overwrite: &Option) -> (Self, TimeOrNever) { + assert!(self.running_refresh.is_none()); + + let mut next = self.next.take().unwrap_or_else(|| { + Backoff::new_with_rng( + &self.backoff_cfg, + rng_overwrite.as_ref().map(|rng| Box::new(rng.clone()) as _), + ) + }); + let time = match next.next().and_then(|d| now.checked_add(d)) { + None => TimeOrNever::Never, + Some(time) => TimeOrNever::Time(time), + }; + let this = Self { + backoff_cfg: self.backoff_cfg.clone(), + tag: self.tag, + next: Some(next), + running_refresh: None, + }; + (this, time) + } +} + +impl Drop for RefreshState { + fn drop(&mut self) { + if let Some(token) = &self.running_refresh { + token.cancel(); + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +enum TimeOrNever { + Time(Time), + Never, +} + +pub mod test_util { + //! Testing utilities for refresh policy. + + use std::{collections::HashMap, time::Duration}; + + use super::*; + + /// Easy-to-control [`RefreshDurationProvider`]. + #[derive(Debug, Default)] + pub struct TestRefreshDurationProvider { + times: Mutex>>, + } + + impl TestRefreshDurationProvider { + /// Create new, empty provider. + pub fn new() -> Self { + Self::default() + } + + /// Specify a refresh duration for a given key-value pair. + /// + /// Existing values will be overridden. 
+ pub fn set_refresh_in(&self, k: u8, v: String, d: Option) { + self.times.lock().insert((k, v), d); + // do NOT check if there was already a value set because we allow overrides + } + } + + impl RefreshDurationProvider for TestRefreshDurationProvider { + type K = u8; + type V = String; + + fn refresh_in(&self, k: &Self::K, v: &Self::V) -> Option { + self.times + .lock() + .get(&(*k, v.clone())) + .unwrap_or_else(|| panic!("refresh time not mocked: K={k}, V={v}")) + .clone() + } + } + + /// Some extensions for [`Notify`]. + pub trait NotifyExt { + /// Wait for notification but panic after a short timeout. + fn notified_with_timeout(&self) -> BoxFuture<'_, ()>; + + /// Ensure that we are NOT notified. + fn not_notified(&self) -> BoxFuture<'_, ()>; + } + + impl NotifyExt for Notify { + fn notified_with_timeout(&self) -> BoxFuture<'_, ()> { + Box::pin(async { + tokio::time::timeout(Duration::from_secs(1), self.notified()) + .await + .expect("notified_with_timeout"); + }) + } + + fn not_notified(&self) -> BoxFuture<'_, ()> { + Box::pin(async { + tokio::time::timeout(Duration::from_millis(10), self.notified()) + .await + .unwrap_err(); + }) + } + } + + /// Generate a simple [`BackoffConfig`] for testing. + /// + /// Uses the given duration as initial backoff and a base of 2. No max backoff and deadline are set. 
+ pub fn backoff_cfg(d: Duration) -> BackoffConfig { + BackoffConfig { + init_backoff: d, + max_backoff: Duration::MAX, + base: 2.0, + deadline: None, + } + } + + #[cfg(test)] + mod tests { + use super::*; + + #[test] + #[should_panic(expected = "refresh time not mocked: K=1, V=foo")] + fn test_provider_panic_not_mocked() { + let provider = TestRefreshDurationProvider::default(); + provider.refresh_in(&1, &String::from("foo")); + } + + #[test] + fn test_provider_mocking() { + let provider = TestRefreshDurationProvider::default(); + + let cfg1 = BackoffConfig::default(); + let cfg2 = BackoffConfig { base: 42., ..cfg1 }; + let cfg3 = BackoffConfig { + base: 1337., + ..cfg1 + }; + + provider.set_refresh_in(1, String::from("a"), None); + provider.set_refresh_in(1, String::from("b"), Some(cfg1.clone())); + provider.set_refresh_in(2, String::from("a"), Some(cfg2.clone())); + + assert_eq!(provider.refresh_in(&1, &String::from("a")), None); + assert_eq!(provider.refresh_in(&1, &String::from("b")), Some(cfg1),); + assert_eq!(provider.refresh_in(&2, &String::from("a")), Some(cfg2),); + + // replace + provider.set_refresh_in(1, String::from("a"), Some(cfg3.clone())); + assert_eq!(provider.refresh_in(&1, &String::from("a")), Some(cfg3),); + } + } +} + +#[cfg(test)] +mod tests { + use std::{collections::HashMap, time::Duration}; + + use iox_time::MockProvider; + use metric::{Observation, RawReporter}; + use rand::rngs::mock::StepRng; + + use crate::{ + backend::{ + policy::{ + refresh::test_util::{backoff_cfg, NotifyExt}, + PolicyBackend, + }, + CacheBackend, + }, + loader::test_util::TestLoader, + }; + + use super::{test_util::TestRefreshDurationProvider, *}; + + #[test] + fn test_time_or_never_ord() { + assert!(TimeOrNever::Never == TimeOrNever::Never); + assert!( + TimeOrNever::Time(Time::from_timestamp_millis(1).unwrap()) + == TimeOrNever::Time(Time::from_timestamp_millis(1).unwrap()) + ); + assert!( + TimeOrNever::Time(Time::from_timestamp_millis(1).unwrap()) + < 
TimeOrNever::Time(Time::from_timestamp_millis(2).unwrap()) + ); + assert!(TimeOrNever::Time(Time::from_timestamp_millis(1).unwrap()) < TimeOrNever::Never); + } + + #[test] + fn test_never_refresh_provider() { + let provider = NeverRefreshProvider::::default(); + assert_eq!(provider.refresh_in(&1, &2), None); + } + + #[test] + fn test_optional_value_ttl_provider() { + let t_none = Some(BackoffConfig { + base: 1., + ..Default::default() + }); + let t_some = Some(BackoffConfig { + base: 2., + ..Default::default() + }); + let provider = + OptionalValueRefreshDurationProvider::::new(t_none.clone(), t_some.clone()); + assert_eq!(provider.refresh_in(&1, &None), t_none); + assert_eq!(provider.refresh_in(&1, &Some(2)), t_some); + } + + #[tokio::test] + #[should_panic(expected = "inner backend is not empty")] + async fn test_panic_inner_not_empty() { + let refresh_duration_provider = Arc::new(TestRefreshDurationProvider::new()); + let metric_registry = metric::Registry::new(); + + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let loader = Arc::new(TestLoader::default()); + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + let policy_constructor = RefreshPolicy::new( + time_provider, + refresh_duration_provider, + loader, + "my_cache", + &metric_registry, + &Handle::current(), + ); + backend.add_policy(|mut handle| { + handle.execute_requests(vec![ChangeRequest::set(1, String::from("foo"))]); + policy_constructor(handle) + }); + } + + #[tokio::test] + async fn test_duration_overflow() { + let refresh_duration_provider = Arc::new(TestRefreshDurationProvider::new()); + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(BackoffConfig { + init_backoff: Duration::MAX, + ..Default::default() + }), + ); + + let metric_registry = metric::Registry::new(); + let time_provider = Arc::new(MockProvider::new(Time::MAX - Duration::from_secs(1))); + let loader = Arc::new(TestLoader::default()); + let mut backend = 
PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(RefreshPolicy::new( + Arc::clone(&time_provider) as _, + refresh_duration_provider, + loader, + "my_cache", + &metric_registry, + &Handle::current(), + )); + + backend.set(1, String::from("a")); + + time_provider.inc(Duration::from_secs(1)); + assert_eq!(backend.get(&1), Some(String::from("a"))); + assert_eq!(get_refresh_metric(&metric_registry), 0); + } + + #[tokio::test] + async fn test_refresh() { + let TestState { + mut backend, + refresh_duration_provider, + time_provider, + loader, + metric_registry, + notify_idle, + .. + } = TestState::new(); + + loader.mock_next(1, String::from("foo")); + loader.mock_next(1, String::from("bar")); + + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + refresh_duration_provider.set_refresh_in( + 1, + String::from("foo"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + refresh_duration_provider.set_refresh_in(1, String::from("bar"), None); + + // start backoff cycle + backend.set(1, String::from("a")); + + // initial notify by the background loop + notify_idle.notified_with_timeout().await; + + // still the same key + assert_eq!(get_inner(&mut backend, 1), Some(String::from("a"))); + assert_eq!(get_refresh_metric(&metric_registry), 0); + + // refresh starts by background timer + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + assert_eq!(get_refresh_metric(&metric_registry), 1); + assert_eq!(get_inner(&mut backend, 1), Some(String::from("foo"))); + + // nothing to refresh yet + notify_idle.not_notified().await; + assert_eq!(get_refresh_metric(&metric_registry), 1); + assert_eq!(get_inner(&mut backend, 1), Some(String::from("foo"))); + + // just bumping the refresh by the old refresh timer won't do anything (we need 2 seconds this time due to the + // base factor) + time_provider.inc(Duration::from_secs(1)); + 
notify_idle.not_notified().await; + assert_eq!(get_refresh_metric(&metric_registry), 1); + assert_eq!(get_inner(&mut backend, 1), Some(String::from("foo"))); + + // try a 2nd update + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + assert_eq!(get_refresh_metric(&metric_registry), 2); + assert_eq!(get_inner(&mut backend, 1), Some(String::from("bar"))); + } + + #[tokio::test] + async fn test_do_not_start_refresh_while_one_is_running() { + let TestState { + mut backend, + refresh_duration_provider, + time_provider, + loader, + notify_idle, + .. + } = TestState::new(); + + let barrier = loader.block_next(1, String::from("foo")); + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + refresh_duration_provider.set_refresh_in(1, String::from("foo"), None); + backend.set(1, String::from("a")); + + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + + // if this would start another refresh then the loader would panic because we've only mocked a single request + time_provider.inc(Duration::from_secs(100)); + notify_idle.not_notified().await; + + barrier.wait().await; + notify_idle.notified_with_timeout().await; + assert_eq!(backend.get(&1), Some(String::from("foo"))); + } + + #[tokio::test] + async fn test_refresh_does_not_override_new_entries() { + let TestState { + mut backend, + refresh_duration_provider, + time_provider, + loader, + notify_idle, + .. 
+ } = TestState::new(); + + let barrier = loader.block_next(1, String::from("foo")); + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + refresh_duration_provider.set_refresh_in(1, String::from("b"), None); + backend.set(1, String::from("a")); + + // perform refresh + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + + backend.set(1, String::from("b")); + barrier.wait().await; + notify_idle.notified_with_timeout().await; + assert_eq!(backend.get(&1), Some(String::from("b"))); + } + + #[tokio::test] + async fn test_remove_cancels_loader() { + let TestState { + mut backend, + refresh_duration_provider, + time_provider, + loader, + notify_idle, + .. + } = TestState::new(); + + let barrier = loader.block_next(1, String::from("foo")); + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + backend.set(1, String::from("a")); + + // perform refresh + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + + assert_eq!(Arc::strong_count(&barrier), 2); + backend.remove(&1); + notify_idle.notified_with_timeout().await; + assert_eq!(Arc::strong_count(&barrier), 1); + } + + #[tokio::test] + async fn test_override_with_no_refresh() { + let TestState { + mut backend, + refresh_duration_provider, + time_provider, + loader, + notify_idle, + .. 
+ } = TestState::new(); + + let barrier = loader.block_next(1, String::from("foo")); + refresh_duration_provider.set_refresh_in( + 1, + String::from("a"), + Some(backoff_cfg(Duration::from_secs(1))), + ); + refresh_duration_provider.set_refresh_in(1, String::from("b"), None); + backend.set(1, String::from("a")); + + // perform refresh + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + + backend.set(1, String::from("b")); + barrier.wait().await; + + // no refresh + time_provider.inc(Duration::from_secs(1)); + notify_idle.notified_with_timeout().await; + assert_eq!(backend.get(&1), Some(String::from("b"))); + } + + #[tokio::test] + async fn test_generic_backend() { + use crate::backend::test_util::test_generic; + + test_generic(|| { + let refresh_duration_provider = Arc::new(NeverRefreshProvider::default()); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let metric_registry = metric::Registry::new(); + let loader = Arc::new(TestLoader::default()); + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + + backend.add_policy(RefreshPolicy::new( + time_provider, + Arc::clone(&refresh_duration_provider) as _, + loader, + "my_cache", + &metric_registry, + &Handle::current(), + )); + backend + }); + } + + struct TestState { + backend: PolicyBackend, + metric_registry: metric::Registry, + refresh_duration_provider: Arc, + time_provider: Arc, + loader: Arc>, + notify_idle: Arc, + } + + impl TestState { + fn new() -> Self { + let refresh_duration_provider = Arc::new(TestRefreshDurationProvider::new()); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let metric_registry = metric::Registry::new(); + let loader = Arc::new(TestLoader::default()); + let notify_idle = Arc::new(Notify::new()); + + // set up "RNG" that always generates the maximum, so we can test things easier + let rng_overwrite = StepRng::new(u64::MAX, 0); + + let mut backend = 
PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(RefreshPolicy::new_inner( + Arc::clone(&time_provider) as _, + Arc::clone(&refresh_duration_provider) as _, + Arc::clone(&loader) as _, + "my_cache", + &metric_registry, + Arc::clone(¬ify_idle), + &Handle::current(), + Some(rng_overwrite), + )); + + Self { + backend, + metric_registry, + refresh_duration_provider, + time_provider, + loader, + notify_idle, + } + } + } + + fn get_inner(backend: &mut PolicyBackend, k: u8) -> Option { + let inner_backend = backend.inner_ref(); + let inner_backend = inner_backend + .as_any() + .downcast_ref::>() + .unwrap(); + inner_backend.get(&k).cloned() + } + + fn get_refresh_metric(metric_registry: &metric::Registry) -> u64 { + let mut reporter = RawReporter::default(); + metric_registry.report(&mut reporter); + let observation = reporter + .metric("cache_refresh") + .unwrap() + .observation(&[("name", "my_cache")]) + .unwrap(); + + if let Observation::U64Counter(c) = observation { + *c + } else { + panic!("Wrong observation type") + } + } +} diff --git a/cache_system/src/backend/policy/remove_if.rs b/cache_system/src/backend/policy/remove_if.rs new file mode 100644 index 0000000..57abf45 --- /dev/null +++ b/cache_system/src/backend/policy/remove_if.rs @@ -0,0 +1,288 @@ +//! Backend that supports custom removal / expiry of keys +use metric::U64Counter; +use parking_lot::Mutex; +use std::{fmt::Debug, hash::Hash, marker::PhantomData, sync::Arc}; + +use crate::{ + backend::policy::{CacheBackend, CallbackHandle, ChangeRequest, Subscriber}, + cache::{Cache, CacheGetStatus}, +}; + +/// Allows explicitly removing entries from the cache. 
+#[derive(Debug, Clone)] +pub struct RemoveIfPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + // the policy itself doesn't do anything, the handles will do all the work + _phantom: PhantomData<(K, V)>, +} + +impl RemoveIfPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// Create new policy. + /// + /// This returns the policy constructor which shall be pass to + /// [`PolicyBackend::add_policy`] and handle that can be used to remove entries. + /// + /// Note that as long as the policy constructor is NOT passed to [`PolicyBackend::add_policy`], the operations on + /// the handle are essentially no-ops (i.e. they will not remove anything). + /// + /// [`PolicyBackend::add_policy`]: super::PolicyBackend::add_policy + pub fn create_constructor_and_handle( + name: &'static str, + metric_registry: &metric::Registry, + ) -> ( + impl FnOnce(CallbackHandle) -> Self, + RemoveIfHandle, + ) { + let metric_removed_by_predicate = metric_registry + .register_metric::( + "cache_removed_by_custom_condition", + "Number of entries removed from a cache via a custom condition", + ) + .recorder(&[("name", name)]); + + let handle = RemoveIfHandle { + callback_handle: Arc::new(Mutex::new(None)), + metric_removed_by_predicate, + }; + let handle_captured = handle.clone(); + + let policy_constructor = move |callback_handle| { + *handle_captured.callback_handle.lock() = Some(callback_handle); + Self { + _phantom: PhantomData, + } + }; + + (policy_constructor, handle) + } +} + +impl Subscriber for RemoveIfPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + type K = K; + type V = V; +} + +/// Handle created by [`RemoveIfPolicy`] that can be used to evict data from caches. +/// +/// The handle can be cloned freely. All clones will refer to the same underlying backend. 
+#[derive(Debug, Clone)] +pub struct RemoveIfHandle +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + callback_handle: Arc>>>, + metric_removed_by_predicate: U64Counter, +} + +impl RemoveIfHandle +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// "remove" a key (aka remove it from the shared backend) if the + /// specified predicate is true. If the key is removed return + /// true, otherwise return false + /// + /// Note that the predicate function is called while the lock is + /// held (and thus the inner backend can't be concurrently accessed + pub fn remove_if

(&self, k: &K, predicate: P) -> bool + where + P: FnOnce(V) -> bool, + { + let mut guard = self.callback_handle.lock(); + let handle = match guard.as_mut() { + Some(handle) => handle, + None => return false, + }; + + let metric_removed_by_predicate = self.metric_removed_by_predicate.clone(); + let mut removed = false; + let removed_captured = &mut removed; + let k = k.clone(); + handle.execute_requests(vec![ChangeRequest::from_fn(move |backend| { + if let Some(v) = backend.get_untracked(&k) { + if predicate(v) { + metric_removed_by_predicate.inc(1); + backend.remove(&k); + *removed_captured = true; + } + } + })]); + + removed + } + + /// Performs [`remove_if`](Self::remove_if) and [`GET`](Cache::get) in one go. + /// + /// Ensures that these two actions interact correctly. + /// + /// # Forward process + /// This function only works if cache values evolve in one direction. This is that the predicate can only flip from + /// `true` to `false` over time (i.e. it detects an outdated value and then an up-to-date value), NOT the other way + /// around (i.e. data cannot get outdated under the same predicate). 
+ pub async fn remove_if_and_get_with_status( + &self, + cache: &C, + k: K, + predicate: P, + extra: GetExtra, + ) -> (V, CacheGetStatus) + where + P: Fn(V) -> bool + Send, + C: Cache, + GetExtra: Clone + Send, + { + let mut removed = self.remove_if(&k, &predicate); + + loop { + // avoid some `Sync` bounds + let k_for_get = k.clone(); + let extra_for_get = extra.clone(); + let (v, status) = cache.get_with_status(k_for_get, extra_for_get).await; + + match status { + CacheGetStatus::Hit => { + // key existed and no other process loaded it => safe to use + return (v, status); + } + CacheGetStatus::Miss => { + // key didn't exist and we loaded it => safe to use + return (v, status); + } + CacheGetStatus::MissAlreadyLoading => { + if removed { + // key was outdated but there was some loading in process, this may have overlapped with our check + // so our check might have been incomplete => need to re-check + removed = self.remove_if(&k, &predicate); + if removed { + // removed again, so cannot use our result + continue; + } else { + // didn't remove => safe to use + return (v, status); + } + } else { + // there was a load action in process but the key was already up-to-date, so it's OK to use the new + // data as well (forward process) + return (v, status); + } + } + } + } + } + + /// Same as [`remove_if_and_get_with_status`](Self::remove_if_and_get_with_status) but without the status. 
+ pub async fn remove_if_and_get( + &self, + cache: &C, + k: K, + predicate: P, + extra: GetExtra, + ) -> V + where + P: Fn(V) -> bool + Send, + C: Cache, + GetExtra: Clone + Send, + { + self.remove_if_and_get_with_status(cache, k, predicate, extra) + .await + .0 + } +} + +#[cfg(test)] +mod tests { + use iox_time::{MockProvider, Time}; + use metric::{Observation, RawReporter}; + + use crate::backend::{policy::PolicyBackend, CacheBackend}; + + use super::*; + + #[test] + fn test_generic_backend() { + use crate::backend::test_util::test_generic; + + test_generic(|| { + let metric_registry = metric::Registry::new(); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + let (policy_constructor, _handle) = + RemoveIfPolicy::create_constructor_and_handle("my_cache", &metric_registry); + backend.add_policy(policy_constructor); + backend + }); + } + + #[test] + fn test_remove_if() { + let metric_registry = metric::Registry::new(); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend: PolicyBackend = PolicyBackend::hashmap_backed(time_provider); + let (policy_constructor, handle) = + RemoveIfPolicy::create_constructor_and_handle("my_cache", &metric_registry); + backend.add_policy(policy_constructor); + backend.set(1, "foo".into()); + backend.set(2, "bar".into()); + + assert_eq!(get_removed_metric(&metric_registry), 0); + + assert!(!handle.remove_if(&1, |v| v == "zzz")); + assert_eq!(backend.get(&1), Some("foo".into())); + assert_eq!(backend.get(&2), Some("bar".into())); + assert_eq!(get_removed_metric(&metric_registry), 0); + + assert!(handle.remove_if(&1, |v| v == "foo")); + assert_eq!(backend.get(&1), None); + assert_eq!(backend.get(&2), Some("bar".into())); + assert_eq!(get_removed_metric(&metric_registry), 1); + + assert!(!handle.remove_if(&1, |v| v == "bar")); + assert_eq!(backend.get(&1), None); + assert_eq!(backend.get(&2), Some("bar".into())); + 
assert_eq!(get_removed_metric(&metric_registry), 1); + } + + #[test] + fn test_not_linked() { + let metric_registry = metric::Registry::new(); + let (_policy_constructor, handle) = + RemoveIfPolicy::::create_constructor_and_handle( + "my_cache", + &metric_registry, + ); + + assert_eq!(get_removed_metric(&metric_registry), 0); + + assert!(!handle.remove_if(&1, |v| v == "zzz")); + assert_eq!(get_removed_metric(&metric_registry), 0); + } + + fn get_removed_metric(metric_registry: &metric::Registry) -> u64 { + let mut reporter = RawReporter::default(); + metric_registry.report(&mut reporter); + let observation = reporter + .metric("cache_removed_by_custom_condition") + .unwrap() + .observation(&[("name", "my_cache")]) + .unwrap(); + + if let Observation::U64Counter(c) = observation { + *c + } else { + panic!("Wrong observation type") + } + } +} diff --git a/cache_system/src/backend/policy/ttl.rs b/cache_system/src/backend/policy/ttl.rs new file mode 100644 index 0000000..fee9e62 --- /dev/null +++ b/cache_system/src/backend/policy/ttl.rs @@ -0,0 +1,755 @@ +//! Time-to-live handling. +use std::{fmt::Debug, hash::Hash, marker::PhantomData, sync::Arc, time::Duration}; + +use iox_time::Time; +use metric::U64Counter; + +use crate::addressable_heap::AddressableHeap; + +use super::{CallbackHandle, ChangeRequest, Subscriber}; + +/// Interface to provide TTL (time to live) data for a key-value pair. +pub trait TtlProvider: std::fmt::Debug + Send + Sync + 'static { + /// Cache key. + type K; + + /// Cached value. + type V; + + /// When should the given key-value pair expire? + /// + /// Return `None` for "never". + /// + /// The function is only called once for a newly cached key-value pair. This means: + /// - There is no need in remembering the time of a given pair (e.g. you can safely always return a constant). + /// - You cannot change the TTL after the data was cached. + /// + /// Expiration is set to take place AT OR AFTER the provided duration. 
+ fn expires_in(&self, k: &Self::K, v: &Self::V) -> Option; +} + +/// [`TtlProvider`] that never expires. +#[derive(Default)] +pub struct NeverTtlProvider +where + K: 'static, + V: 'static, +{ + // phantom data that is Send and Sync, see https://stackoverflow.com/a/50201389 + _k: PhantomData K>, + _v: PhantomData V>, +} + +impl std::fmt::Debug for NeverTtlProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("NeverTtlProvider").finish_non_exhaustive() + } +} + +impl TtlProvider for NeverTtlProvider { + type K = K; + type V = V; + + fn expires_in(&self, _k: &Self::K, _v: &Self::V) -> Option { + None + } +} + +/// [`TtlProvider`] that returns a constant value. +pub struct ConstantValueTtlProvider +where + K: 'static, + V: 'static, +{ + // phantom data that is Send and Sync, see https://stackoverflow.com/a/50201389 + _k: PhantomData K>, + _v: PhantomData V>, + + ttl: Option, +} + +impl ConstantValueTtlProvider +where + K: 'static, + V: 'static, +{ + /// Create new provider with the given TTL value. + pub fn new(ttl: Option) -> Self { + Self { + _k: PhantomData, + _v: PhantomData, + ttl, + } + } +} + +impl std::fmt::Debug for ConstantValueTtlProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ConstantValueTtlProvider") + .field("ttl", &self.ttl) + .finish_non_exhaustive() + } +} + +impl TtlProvider for ConstantValueTtlProvider { + type K = K; + type V = V; + + fn expires_in(&self, _k: &Self::K, _v: &Self::V) -> Option { + self.ttl + } +} + +/// [`TtlProvider`] that returns different values for `None`/`Some(...)` values. 
+pub struct OptionalValueTtlProvider +where + K: 'static, + V: 'static, +{ + // phantom data that is Send and Sync, see https://stackoverflow.com/a/50201389 + _k: PhantomData K>, + _v: PhantomData V>, + + ttl_none: Option, + ttl_some: Option, +} + +impl OptionalValueTtlProvider +where + K: 'static, + V: 'static, +{ + /// Create new provider with the given TTL values for `None` and `Some(...)`. + pub fn new(ttl_none: Option, ttl_some: Option) -> Self { + Self { + _k: PhantomData, + _v: PhantomData, + ttl_none, + ttl_some, + } + } +} + +impl std::fmt::Debug for OptionalValueTtlProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OptionalValueTtlProvider") + .field("ttl_none", &self.ttl_none) + .field("ttl_some", &self.ttl_some) + .finish_non_exhaustive() + } +} + +impl TtlProvider for OptionalValueTtlProvider { + type K = K; + type V = Option; + + fn expires_in(&self, _k: &Self::K, v: &Self::V) -> Option { + match v { + None => self.ttl_none, + Some(_) => self.ttl_some, + } + } +} + +/// Cache policy that implements Time To Life. +/// +/// # Cache Eviction +/// Every method ([`get`](Subscriber::get), [`set`](Subscriber::set), [`remove`](Subscriber::remove)) causes the +/// cache to check for expired keys. This may lead to certain delays, esp. when dropping the contained values takes a +/// long time. +#[derive(Debug)] +pub struct TtlPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + ttl_provider: Arc>, + expiration: AddressableHeap, + metric_expired: U64Counter, +} + +impl TtlPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// Create new TTL policy. 
+ pub fn new( + ttl_provider: Arc>, + name: &'static str, + metric_registry: &metric::Registry, + ) -> impl FnOnce(CallbackHandle) -> Self { + let metric_expired = metric_registry + .register_metric::( + "cache_ttl_expired", + "Number of entries that expired via TTL.", + ) + .recorder(&[("name", name)]); + + |mut callback_handle| { + callback_handle.execute_requests(vec![ChangeRequest::ensure_empty()]); + + Self { + ttl_provider, + expiration: Default::default(), + metric_expired, + } + } + } + + fn evict_expired(&mut self, now: Time) -> Vec> { + let mut requests = vec![]; + + while self + .expiration + .peek() + .map(|(_k, _, t)| *t <= now) + .unwrap_or_default() + { + let (k, _, _t) = self.expiration.pop().unwrap(); + self.metric_expired.inc(1); + requests.push(ChangeRequest::remove(k)); + } + + requests + } +} + +impl Subscriber for TtlPolicy +where + K: Clone + Eq + Debug + Hash + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + type K = K; + type V = V; + + fn get(&mut self, _k: &Self::K, now: Time) -> Vec> { + self.evict_expired(now) + } + + fn set( + &mut self, + k: &Self::K, + v: &Self::V, + now: Time, + ) -> Vec> { + let mut requests = self.evict_expired(now); + + if let Some(ttl) = self.ttl_provider.expires_in(k, v) { + if ttl.is_zero() { + requests.push(ChangeRequest::remove(k.clone())); + } + + match now.checked_add(ttl) { + Some(t) => { + self.expiration.insert(k.clone(), (), t); + } + None => { + // Still need to ensure that any current expiration is disabled + self.expiration.remove(k); + } + } + } else { + // Still need to ensure that any current expiration is disabled + self.expiration.remove(k); + }; + + requests + } + + fn remove(&mut self, k: &Self::K, now: Time) -> Vec> { + self.expiration.remove(k); + self.evict_expired(now) + } +} + +pub mod test_util { + //! Test utils for TTL policy. + use std::collections::HashMap; + + use parking_lot::Mutex; + + use super::*; + + /// [`TtlProvider`] for testing. 
+ #[derive(Debug, Default)] + pub struct TestTtlProvider { + expires_in: Mutex>>, + } + + impl TestTtlProvider { + /// Create new, empty provider. + pub fn new() -> Self { + Self::default() + } + + /// Set TTL time for given key-value pair. + pub fn set_expires_in(&self, k: u8, v: String, d: Option) { + self.expires_in.lock().insert((k, v), d); + } + } + + impl TtlProvider for TestTtlProvider { + type K = u8; + type V = String; + + fn expires_in(&self, k: &Self::K, v: &Self::V) -> Option { + *self + .expires_in + .lock() + .get(&(*k, v.clone())) + .expect("expires_in value not mocked") + } + } + + #[cfg(test)] + mod tests { + use super::*; + + #[test] + #[should_panic(expected = "expires_in value not mocked")] + fn test_panic_value_not_mocked() { + TestTtlProvider::new().expires_in(&1, &String::from("foo")); + } + + #[test] + fn test_mocking() { + let provider = TestTtlProvider::default(); + + provider.set_expires_in(1, String::from("a"), None); + provider.set_expires_in(1, String::from("b"), Some(Duration::from_secs(1))); + provider.set_expires_in(2, String::from("a"), Some(Duration::from_secs(2))); + + assert_eq!(provider.expires_in(&1, &String::from("a")), None,); + assert_eq!( + provider.expires_in(&1, &String::from("b")), + Some(Duration::from_secs(1)), + ); + assert_eq!( + provider.expires_in(&2, &String::from("a")), + Some(Duration::from_secs(2)), + ); + + // replace + provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(3))); + assert_eq!( + provider.expires_in(&1, &String::from("a")), + Some(Duration::from_secs(3)), + ); + } + } +} + +#[cfg(test)] +mod tests { + use std::{collections::HashMap, time::Duration}; + + use iox_time::MockProvider; + use metric::{Observation, RawReporter}; + + use crate::backend::{policy::PolicyBackend, CacheBackend}; + + use super::{test_util::TestTtlProvider, *}; + + #[test] + fn test_never_ttl_provider() { + let provider = NeverTtlProvider::::default(); + assert_eq!(provider.expires_in(&1, &2), None); + } + + 
#[test] + fn test_constant_value_ttl_provider() { + let ttl = Some(Duration::from_secs(1)); + let provider = ConstantValueTtlProvider::::new(ttl); + assert_eq!(provider.expires_in(&1, &2), ttl); + } + + #[test] + fn test_optional_value_ttl_provider() { + let ttl_none = Some(Duration::from_secs(1)); + let ttl_some = Some(Duration::from_secs(2)); + let provider = OptionalValueTtlProvider::::new(ttl_none, ttl_some); + assert_eq!(provider.expires_in(&1, &None), ttl_none); + assert_eq!(provider.expires_in(&1, &Some(2)), ttl_some); + } + + #[test] + #[should_panic(expected = "inner backend is not empty")] + fn test_panic_inner_not_empty() { + let ttl_provider = Arc::new(TestTtlProvider::new()); + let metric_registry = metric::Registry::new(); + + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let mut backend: PolicyBackend = PolicyBackend::hashmap_backed(time_provider); + let policy_constructor = + TtlPolicy::new(Arc::clone(&ttl_provider) as _, "my_cache", &metric_registry); + backend.add_policy(|mut handle| { + handle.execute_requests(vec![ChangeRequest::set(1, String::from("foo"))]); + policy_constructor(handle) + }); + } + + #[test] + fn test_expires_single() { + let TestState { + mut backend, + metric_registry, + ttl_provider, + time_provider, + } = TestState::new(); + + assert_eq!(get_expired_metric(&metric_registry), 0); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + assert_eq!(get_expired_metric(&metric_registry), 0); + + time_provider.inc(Duration::from_secs(1)); + assert_eq!(backend.get(&1), None); + + assert_eq!(get_expired_metric(&metric_registry), 1); + } + + #[test] + fn test_overflow_expire() { + let ttl_provider = Arc::new(TestTtlProvider::new()); + let metric_registry = metric::Registry::new(); + + // init time provider at MAX! 
+ let time_provider = Arc::new(MockProvider::new(Time::MAX)); + let mut backend: PolicyBackend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(TtlPolicy::new( + Arc::clone(&ttl_provider) as _, + "my_cache", + &metric_registry, + )); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::MAX)); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + } + + #[test] + fn test_never_expire() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. + } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), None); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + time_provider.inc(Duration::from_secs(1)); + assert_eq!(backend.get(&1), Some(String::from("a"))); + } + + #[test] + fn test_expiration_uses_key_and_value() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. + } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + ttl_provider.set_expires_in(1, String::from("b"), Some(Duration::from_secs(4))); + ttl_provider.set_expires_in(2, String::from("a"), Some(Duration::from_secs(2))); + backend.set(1, String::from("b")); + + time_provider.inc(Duration::from_secs(3)); + assert_eq!(backend.get(&1), Some(String::from("b"))); + } + + #[test] + fn test_override_with_different_expiration() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. 
+ } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(3))); + backend.set(1, String::from("a")); + + time_provider.inc(Duration::from_secs(2)); + assert_eq!(backend.get(&1), Some(String::from("a"))); + } + + #[test] + fn test_override_with_no_expiration() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. + } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + ttl_provider.set_expires_in(1, String::from("a"), None); + backend.set(1, String::from("a")); + + time_provider.inc(Duration::from_secs(2)); + assert_eq!(backend.get(&1), Some(String::from("a"))); + } + + #[test] + fn test_override_with_some_expiration() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. + } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), None); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + backend.set(1, String::from("a")); + + time_provider.inc(Duration::from_secs(2)); + assert_eq!(backend.get(&1), None); + } + + #[test] + fn test_override_with_overflow() { + let ttl_provider = Arc::new(TestTtlProvider::new()); + let metric_registry = metric::Registry::new(); + + // init time provider at nearly MAX! 
+ let time_provider = Arc::new(MockProvider::new(Time::MAX - Duration::from_secs(2))); + let mut backend: PolicyBackend = + PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(TtlPolicy::new( + Arc::clone(&ttl_provider) as _, + "my_cache", + &metric_registry, + )); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(u64::MAX))); + backend.set(1, String::from("a")); + + time_provider.inc(Duration::from_secs(2)); + assert_eq!(backend.get(&1), Some(String::from("a"))); + } + + #[test] + fn test_readd_with_different_expiration() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. + } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(3))); + backend.remove(&1); + backend.set(1, String::from("a")); + + time_provider.inc(Duration::from_secs(2)); + assert_eq!(backend.get(&1), Some(String::from("a"))); + } + + #[test] + fn test_readd_with_no_expiration() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. + } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + ttl_provider.set_expires_in(1, String::from("a"), None); + backend.remove(&1); + backend.set(1, String::from("a")); + + time_provider.inc(Duration::from_secs(2)); + assert_eq!(backend.get(&1), Some(String::from("a"))); + } + + #[test] + fn test_update_cleans_multiple_keys() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. 
+ } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + ttl_provider.set_expires_in(2, String::from("b"), Some(Duration::from_secs(2))); + ttl_provider.set_expires_in(3, String::from("c"), Some(Duration::from_secs(2))); + ttl_provider.set_expires_in(4, String::from("d"), Some(Duration::from_secs(3))); + backend.set(1, String::from("a")); + backend.set(2, String::from("b")); + backend.set(3, String::from("c")); + backend.set(4, String::from("d")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + assert_eq!(backend.get(&2), Some(String::from("b"))); + assert_eq!(backend.get(&3), Some(String::from("c"))); + assert_eq!(backend.get(&4), Some(String::from("d"))); + + time_provider.inc(Duration::from_secs(2)); + assert_eq!(backend.get(&1), None); + + { + let inner_ref = backend.inner_ref(); + let inner_backend = inner_ref + .as_any() + .downcast_ref::>() + .unwrap(); + assert!(!inner_backend.contains_key(&1)); + assert!(!inner_backend.contains_key(&2)); + assert!(!inner_backend.contains_key(&3)); + assert!(inner_backend.contains_key(&4)); + } + + assert_eq!(backend.get(&2), None); + assert_eq!(backend.get(&3), None); + assert_eq!(backend.get(&4), Some(String::from("d"))); + } + + #[test] + fn test_remove_expired_key() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. + } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + backend.set(1, String::from("a")); + assert_eq!(backend.get(&1), Some(String::from("a"))); + + time_provider.inc(Duration::from_secs(1)); + backend.remove(&1); + assert_eq!(backend.get(&1), None); + } + + #[test] + fn test_expire_removed_key() { + let TestState { + mut backend, + ttl_provider, + time_provider, + .. 
+ } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(1))); + ttl_provider.set_expires_in(2, String::from("b"), Some(Duration::from_secs(2))); + backend.set(1, String::from("a")); + backend.remove(&1); + + time_provider.inc(Duration::from_secs(1)); + backend.set(2, String::from("b")); + assert_eq!(backend.get(&1), None); + assert_eq!(backend.get(&2), Some(String::from("b"))); + } + + #[test] + fn test_expire_immediately() { + let TestState { + mut backend, + ttl_provider, + .. + } = TestState::new(); + + ttl_provider.set_expires_in(1, String::from("a"), Some(Duration::from_secs(0))); + backend.set(1, String::from("a")); + + assert!(backend.is_empty()); + + assert_eq!(backend.get(&1), None); + } + + #[test] + fn test_generic_backend() { + use crate::backend::test_util::test_generic; + + test_generic(|| { + let ttl_provider = Arc::new(NeverTtlProvider::default()); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let metric_registry = metric::Registry::new(); + let mut backend = PolicyBackend::hashmap_backed(time_provider); + backend.add_policy(TtlPolicy::new( + Arc::clone(&ttl_provider) as _, + "my_cache", + &metric_registry, + )); + backend + }); + } + + struct TestState { + backend: PolicyBackend, + metric_registry: metric::Registry, + ttl_provider: Arc, + time_provider: Arc, + } + + impl TestState { + fn new() -> Self { + let ttl_provider = Arc::new(TestTtlProvider::new()); + let time_provider = Arc::new(MockProvider::new(Time::MIN)); + let metric_registry = metric::Registry::new(); + + let mut backend = PolicyBackend::hashmap_backed(Arc::clone(&time_provider) as _); + backend.add_policy(TtlPolicy::new( + Arc::clone(&ttl_provider) as _, + "my_cache", + &metric_registry, + )); + + Self { + backend, + metric_registry, + ttl_provider, + time_provider, + } + } + } + + fn get_expired_metric(metric_registry: &metric::Registry) -> u64 { + let mut reporter = RawReporter::default(); + 
metric_registry.report(&mut reporter); + let observation = reporter + .metric("cache_ttl_expired") + .unwrap() + .observation(&[("name", "my_cache")]) + .unwrap(); + + if let Observation::U64Counter(c) = observation { + *c + } else { + panic!("Wrong observation type") + } + } +} diff --git a/cache_system/src/backend/test_util.rs b/cache_system/src/backend/test_util.rs new file mode 100644 index 0000000..21dae2c --- /dev/null +++ b/cache_system/src/backend/test_util.rs @@ -0,0 +1,112 @@ +use super::CacheBackend; + +/// Generic test set for [`Backend`]. +/// +/// The backend must NOT perform any pruning/deletions during the tests (even though backends are allowed to do that in +/// general). +pub fn test_generic(constructor: F) +where + B: CacheBackend, + F: Fn() -> B, +{ + test_get_empty(constructor()); + test_get_set(constructor()); + test_get_twice(constructor()); + test_override(constructor()); + test_set_remove_get(constructor()); + test_remove_empty(constructor()); + test_readd(constructor()); + test_is_empty(constructor()); +} + +/// Test GET on empty backend. +fn test_get_empty(mut backend: B) +where + B: CacheBackend, +{ + assert_eq!(backend.get(&1), None); +} + +/// Test GET and SET without any overrides. +fn test_get_set(mut backend: B) +where + B: CacheBackend, +{ + backend.set(1, String::from("a")); + backend.set(2, String::from("b")); + + assert_eq!(backend.get(&1), Some(String::from("a"))); + assert_eq!(backend.get(&2), Some(String::from("b"))); + assert_eq!(backend.get(&3), None); +} + +/// Test that a value can be retrieved multiple times. +fn test_get_twice(mut backend: B) +where + B: CacheBackend, +{ + backend.set(1, String::from("a")); + + assert_eq!(backend.get(&1), Some(String::from("a"))); + assert_eq!(backend.get(&1), Some(String::from("a"))); +} + +/// Test that setting a value twice w/o deletion overrides the existing value. 
+fn test_override(mut backend: B) +where + B: CacheBackend, +{ + backend.set(1, String::from("a")); + backend.set(1, String::from("b")); + + assert_eq!(backend.get(&1), Some(String::from("b"))); +} + +/// Test removal of on empty backend. +fn test_remove_empty(mut backend: B) +where + B: CacheBackend, +{ + backend.remove(&1); +} + +/// Test removal of existing values. +fn test_set_remove_get(mut backend: B) +where + B: CacheBackend, +{ + backend.set(1, String::from("a")); + backend.remove(&1); + + assert_eq!(backend.get(&1), None); +} + +/// Test setting a new value after removing it. +fn test_readd(mut backend: B) +where + B: CacheBackend, +{ + backend.set(1, String::from("a")); + backend.remove(&1); + backend.set(1, String::from("b")); + + assert_eq!(backend.get(&1), Some(String::from("b"))); +} + +/// Test `is_empty` check. +fn test_is_empty(mut backend: B) +where + B: CacheBackend, +{ + assert!(backend.is_empty()); + + backend.set(1, String::from("a")); + backend.set(2, String::from("b")); + assert!(!backend.is_empty()); + + backend.remove(&1); + assert!(!backend.is_empty()); + + backend.remove(&2); + assert!(backend.is_empty()); +} diff --git a/cache_system/src/cache/driver.rs b/cache_system/src/cache/driver.rs new file mode 100644 index 0000000..c0c9773 --- /dev/null +++ b/cache_system/src/cache/driver.rs @@ -0,0 +1,452 @@ +//! Main data structure, see [`CacheDriver`]. 
+ +use crate::{ + backend::CacheBackend, + cancellation_safe_future::{CancellationSafeFuture, CancellationSafeFutureReceiver}, + loader::Loader, +}; +use async_trait::async_trait; +use futures::{ + channel::oneshot::{channel, Canceled, Sender}, + future::{BoxFuture, Shared}, + FutureExt, TryFutureExt, +}; +use observability_deps::tracing::debug; +use std::{collections::HashMap, fmt::Debug, future::Future, sync::Arc}; +use tracker::{LockMetrics, Mutex}; + +use super::{Cache, CacheGetStatus, CachePeekStatus}; + +/// Combine a [`CacheBackend`] and a [`Loader`] into a single [`Cache`] +#[derive(Debug)] +pub struct CacheDriver +where + B: CacheBackend + Send + 'static, + L: Loader, +{ + state: Arc>>, + loader: Arc, +} + +impl CacheDriver +where + B: CacheBackend + Send + 'static, + L: Loader, +{ + /// Create new, empty cache with given loader function. + pub fn new(loader: Arc, backend: B, metrics: &metric::Registry, name: &'static str) -> Self { + let metrics = Arc::new(LockMetrics::new( + metrics, + &[("what", "cache_driver_state"), ("cache", name)], + )); + + Self { + state: Arc::new(metrics.new_mutex(CacheState { + cached_entries: backend, + running_queries: HashMap::new(), + tag_counter: 0, + })), + loader, + } + } + + fn start_new_query( + state: &mut CacheState, + state_captured: Arc>>, + loader: Arc, + k: B::K, + extra: L::Extra, + ) -> ( + CancellationSafeFuture>, + SharedReceiver, + ) { + let (tx_main, rx_main) = channel(); + let receiver = rx_main + .map_ok(|v| Arc::new(Mutex::new(v))) + .map_err(Arc::new) + .boxed() + .shared(); + let (tx_set, rx_set) = channel(); + + // generate unique tag + let tag = state.tag_counter; + state.tag_counter += 1; + + // need to wrap the query into a `CancellationSafeFuture` so that it doesn't get cancelled when + // this very request is cancelled + let join_handle_receiver = CancellationSafeFutureReceiver::default(); + let k_captured = k.clone(); + let fut = async move { + let loader_fut = async move { + let submitter = 
ResultSubmitter::new(state_captured, k_captured.clone(), tag); + + // execute the loader + // If we panic here then `tx` will be dropped and the receivers will be + // notified. + let v = loader.load(k_captured, extra).await; + + // remove "running" state and store result + let was_running = submitter.submit(v.clone()); + + if !was_running { + // value was side-loaded, so we cannot populate `v`. Instead block this + // execution branch and wait for `rx_set` to deliver the side-loaded + // result. + loop { + tokio::task::yield_now().await; + } + } + + v + }; + + // prefer the side-loader + let v = futures::select_biased! { + maybe_v = rx_set.fuse() => { + match maybe_v { + Ok(v) => { + // data get side-loaded via `Cache::set`. In this case, we do + // NOT modify the state because there would be a lock-gap. The + // `set` function will do that for us instead. + v + } + Err(_) => { + // sender side is gone, very likely the cache is shutting down + debug!( + "Sender for side-loading data into running query gone.", + ); + return; + } + } + } + v = loader_fut.fuse() => v, + }; + + // broadcast result + // It's OK if the receiver side is gone. 
This might happen during shutdown + tx_main.send(v).ok(); + }; + let fut = CancellationSafeFuture::new(fut, join_handle_receiver.clone()); + + state.running_queries.insert( + k, + RunningQuery { + recv: receiver.clone(), + set: tx_set, + _join_handle: join_handle_receiver, + tag, + }, + ); + + (fut, receiver) + } +} + +#[async_trait] +impl Cache for CacheDriver +where + B: CacheBackend + Send, + L: Loader, +{ + type K = B::K; + type V = B::V; + type GetExtra = L::Extra; + type PeekExtra = (); + + async fn get_with_status( + &self, + k: Self::K, + extra: Self::GetExtra, + ) -> (Self::V, CacheGetStatus) { + // place state locking into its own scope so it doesn't leak into the generator (async + // function) + let (fut, receiver, status) = { + let mut state = self.state.lock(); + + // check if the entry has already been cached + if let Some(v) = state.cached_entries.get(&k) { + return (v, CacheGetStatus::Hit); + } + + // check if there is already a query for this key running + if let Some(running_query) = state.running_queries.get(&k) { + ( + None, + running_query.recv.clone(), + CacheGetStatus::MissAlreadyLoading, + ) + } else { + // requires new query + let (fut, receiver) = Self::start_new_query( + &mut state, + Arc::clone(&self.state), + Arc::clone(&self.loader), + k, + extra, + ); + (Some(fut), receiver, CacheGetStatus::Miss) + } + }; + + // try to run the loader future in this very task context to avoid spawning tokio tasks (which adds latency and + // overhead) + if let Some(fut) = fut { + fut.await; + } + + let v = retrieve_from_shared(receiver).await; + + (v, status) + } + + async fn peek_with_status( + &self, + k: Self::K, + _extra: Self::PeekExtra, + ) -> Option<(Self::V, CachePeekStatus)> { + // place state locking into its own scope so it doesn't leak into the generator (async + // function) + let (receiver, status) = { + let mut state = self.state.lock(); + + // check if the entry has already been cached + if let Some(v) = state.cached_entries.get(&k) { 
+ return Some((v, CachePeekStatus::Hit)); + } + + // check if there is already a query for this key running + if let Some(running_query) = state.running_queries.get(&k) { + ( + running_query.recv.clone(), + CachePeekStatus::MissAlreadyLoading, + ) + } else { + return None; + } + }; + + let v = retrieve_from_shared(receiver).await; + + Some((v, status)) + } + + async fn set(&self, k: Self::K, v: Self::V) { + let maybe_join_handle = { + let mut state = self.state.lock(); + + let maybe_recv = if let Some(running_query) = state.running_queries.remove(&k) { + // it's OK when the receiver side is gone (likely panicked) + running_query.set.send(v.clone()).ok(); + + // When we side-load data into the running task, the task does NOT modify the + // backend, so we have to do that. The reason for not letting the task feed the + // side-loaded data back into `cached_entries` is that we would need to drop the + // state lock here before the task could acquire it, leading to a lock gap. + Some(running_query.recv) + } else { + None + }; + + state.cached_entries.set(k, v); + + maybe_recv + }; + + // drive running query (if any) to completion + if let Some(recv) = maybe_join_handle { + // we do not care if the query died (e.g. due to a panic) + recv.await.ok(); + } + } +} + +impl Drop for CacheDriver +where + B: CacheBackend + Send, + L: Loader, +{ + fn drop(&mut self) { + for _ in self.state.lock().running_queries.drain() {} + } +} + +/// Helper to submit results of running queries. +/// +/// Ensures that running query is removed when dropped (e.g. during panic). +struct ResultSubmitter +where + B: CacheBackend, +{ + state: Arc>>, + tag: u64, + k: Option, + v: Option, +} + +impl ResultSubmitter +where + B: CacheBackend, +{ + fn new(state: Arc>>, k: B::K, tag: u64) -> Self { + Self { + state, + tag, + k: Some(k), + v: None, + } + } + + /// Submit value. + /// + /// Returns `true` if this very query was running. 
+ fn submit(mut self, v: B::V) -> bool { + assert!(self.v.is_none()); + self.v = Some(v); + self.finalize() + } + + /// Finalize request. + /// + /// Returns `true` if this very query was running. + fn finalize(&mut self) -> bool { + let k = self.k.take().expect("finalized twice"); + let mut state = self.state.lock(); + + match state.running_queries.get(&k) { + Some(running_query) if running_query.tag == self.tag => { + state.running_queries.remove(&k); + + if let Some(v) = self.v.take() { + // this very query is in charge of the key, so store in in the + // underlying cache + state.cached_entries.set(k, v); + } + + true + } + _ => { + // This query is actually not really running any longer but got + // shut down, e.g. due to side loading. Do NOT store the + // generated value in the underlying cache. + + false + } + } + } +} + +impl Drop for ResultSubmitter +where + B: CacheBackend, +{ + fn drop(&mut self) { + if self.k.is_some() { + // not finalized yet + self.finalize(); + } + } +} + +/// A [`tokio::sync::oneshot::Receiver`] that can be cloned. +/// +/// The types are: +/// +/// - `Arc>`: Ensures that we can clone `V` without requiring `V: Sync`. At the same time +/// the reference to `V` (i.e. the `Arc`) must be cloneable for `Shared` +/// - `Arc`: Is required because `RecvError` is not `Clone` but `Shared` requires that. +/// - `BoxFuture`: The transformation from `Result` to `Result>, +/// Arc>` results in a kinda messy type and we wanna erase that. +/// - `Shared`: Allow the receiver to be cloned and be awaited from multiple places. +type SharedReceiver = Shared>, Arc>>>; + +/// Retrieve data from shared receiver. +async fn retrieve_from_shared(receiver: SharedReceiver) -> V +where + V: Clone + Send, +{ + receiver + .await + .expect("cache loader panicked, see logs") + .lock() + .clone() +} + +/// State for coordinating the execution of a single running query. +#[derive(Debug)] +struct RunningQuery { + /// A receiver that can await the result as well. 
+ recv: SharedReceiver, + + /// A sender that enables setting entries while the query is running. + #[allow(dead_code)] + set: Sender, + + /// A handle for the task that is currently executing the query. + /// + /// The handle can be used to abort the running query, e.g. when dropping the cache. + /// + /// This is "dead code" because we only store it to keep the future alive. There's no direct interaction. + _join_handle: CancellationSafeFutureReceiver<()>, + + /// Tag so that queries for the same key (e.g. when starting, side-loading, starting again) can + /// be told apart. + tag: u64, +} + +/// Inner cache state that is usually guarded by a lock. +/// +/// The state parts must be updated in a consistent manner, i.e. while using the same lock guard. +#[derive(Debug)] +struct CacheState +where + B: CacheBackend, +{ + /// Cached entires (i.e. queries completed). + cached_entries: B, + + /// Currently running queries indexed by cache key. + running_queries: HashMap>, + + /// Tag counter for running queries. + tag_counter: u64, +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::{ + cache::test_util::{run_test_generic, TestAdapter}, + loader::test_util::TestLoader, + }; + + use super::*; + + #[tokio::test] + async fn test_generic() { + run_test_generic(MyTestAdapter).await; + } + + struct MyTestAdapter; + + impl TestAdapter for MyTestAdapter { + type GetExtra = bool; + type PeekExtra = (); + type Cache = CacheDriver, TestLoader>; + + fn construct(&self, loader: Arc) -> Arc { + Arc::new(CacheDriver::new( + Arc::clone(&loader) as _, + HashMap::new(), + &metric::Registry::default(), + "test", + )) + } + + fn get_extra(&self, inner: bool) -> Self::GetExtra { + inner + } + + fn peek_extra(&self) -> Self::PeekExtra {} + } +} diff --git a/cache_system/src/cache/metrics.rs b/cache_system/src/cache/metrics.rs new file mode 100644 index 0000000..c72364a --- /dev/null +++ b/cache_system/src/cache/metrics.rs @@ -0,0 +1,718 @@ +//! 
Metrics instrumentation for [`Cache`]s. +use std::{fmt::Debug, sync::Arc}; + +use async_trait::async_trait; +use iox_time::{Time, TimeProvider}; +use metric::{Attributes, DurationHistogram, U64Counter}; +use observability_deps::tracing::warn; +use trace::span::{Span, SpanRecorder}; + +use super::{Cache, CacheGetStatus, CachePeekStatus}; + +/// Struct containing all the metrics +#[derive(Debug)] +struct Metrics { + time_provider: Arc, + metric_get_hit: DurationHistogram, + metric_get_miss: DurationHistogram, + metric_get_miss_already_loading: DurationHistogram, + metric_get_cancelled: DurationHistogram, + metric_peek_hit: DurationHistogram, + metric_peek_miss: DurationHistogram, + metric_peek_miss_already_loading: DurationHistogram, + metric_peek_cancelled: DurationHistogram, + metric_set: U64Counter, +} + +impl Metrics { + fn new( + name: &'static str, + time_provider: Arc, + metric_registry: &metric::Registry, + ) -> Self { + let attributes = Attributes::from(&[("name", name)]); + + let mut attributes_get = attributes.clone(); + let metric_get = metric_registry + .register_metric::("iox_cache_get", "Cache GET requests"); + + attributes_get.insert("status", "hit"); + let metric_get_hit = metric_get.recorder(attributes_get.clone()); + + attributes_get.insert("status", "miss"); + let metric_get_miss = metric_get.recorder(attributes_get.clone()); + + attributes_get.insert("status", "miss_already_loading"); + let metric_get_miss_already_loading = metric_get.recorder(attributes_get.clone()); + + attributes_get.insert("status", "cancelled"); + let metric_get_cancelled = metric_get.recorder(attributes_get); + + let mut attributes_peek = attributes.clone(); + let metric_peek = metric_registry + .register_metric::("iox_cache_peek", "Cache PEEK requests"); + + attributes_peek.insert("status", "hit"); + let metric_peek_hit = metric_peek.recorder(attributes_peek.clone()); + + attributes_peek.insert("status", "miss"); + let metric_peek_miss = 
metric_peek.recorder(attributes_peek.clone()); + + attributes_peek.insert("status", "miss_already_loading"); + let metric_peek_miss_already_loading = metric_peek.recorder(attributes_peek.clone()); + + attributes_peek.insert("status", "cancelled"); + let metric_peek_cancelled = metric_peek.recorder(attributes_peek); + + let metric_set = metric_registry + .register_metric::("iox_cache_set", "Cache SET requests.") + .recorder(attributes); + + Self { + time_provider, + metric_get_hit, + metric_get_miss, + metric_get_miss_already_loading, + metric_get_cancelled, + metric_peek_hit, + metric_peek_miss, + metric_peek_miss_already_loading, + metric_peek_cancelled, + metric_set, + } + } +} + +/// Wraps given cache with metrics. +#[derive(Debug)] +pub struct CacheWithMetrics +where + C: Cache, +{ + inner: C, + metrics: Metrics, +} + +impl CacheWithMetrics +where + C: Cache, +{ + /// Create new metrics wrapper around given cache. + pub fn new( + inner: C, + name: &'static str, + time_provider: Arc, + metric_registry: &metric::Registry, + ) -> Self { + Self { + inner, + metrics: Metrics::new(name, time_provider, metric_registry), + } + } +} + +#[async_trait] +impl Cache for CacheWithMetrics +where + C: Cache, +{ + type K = C::K; + type V = C::V; + type GetExtra = (C::GetExtra, Option); + type PeekExtra = (C::PeekExtra, Option); + + async fn get_with_status( + &self, + k: Self::K, + extra: Self::GetExtra, + ) -> (Self::V, CacheGetStatus) { + let (extra, span) = extra; + let mut set_on_drop = SetGetMetricOnDrop::new(&self.metrics, span); + let (v, status) = self.inner.get_with_status(k, extra).await; + set_on_drop.status = Some(status); + + (v, status) + } + + async fn peek_with_status( + &self, + k: Self::K, + extra: Self::PeekExtra, + ) -> Option<(Self::V, CachePeekStatus)> { + let (extra, span) = extra; + let mut set_on_drop = SetPeekMetricOnDrop::new(&self.metrics, span); + let res = self.inner.peek_with_status(k, extra).await; + set_on_drop.status = 
Some(res.as_ref().map(|(_v, status)| *status)); + + res + } + + async fn set(&self, k: Self::K, v: Self::V) { + self.inner.set(k, v).await; + self.metrics.metric_set.inc(1); + } +} + +/// Helper that set's GET metrics on drop depending on the `status`. +/// +/// A drop might happen due to completion (in which case the `status` should be set) or if the future is cancelled (in +/// which case the `status` is `None`). +struct SetGetMetricOnDrop<'a> { + metrics: &'a Metrics, + t_start: Time, + status: Option, + span_recorder: SpanRecorder, +} + +impl<'a> SetGetMetricOnDrop<'a> { + fn new(metrics: &'a Metrics, span: Option) -> Self { + let t_start = metrics.time_provider.now(); + + Self { + metrics, + t_start, + status: None, + span_recorder: SpanRecorder::new(span), + } + } +} + +impl<'a> Drop for SetGetMetricOnDrop<'a> { + fn drop(&mut self) { + let t_end = self.metrics.time_provider.now(); + + match t_end.checked_duration_since(self.t_start) { + Some(duration) => { + match self.status { + Some(CacheGetStatus::Hit) => &self.metrics.metric_get_hit, + Some(CacheGetStatus::Miss) => &self.metrics.metric_get_miss, + Some(CacheGetStatus::MissAlreadyLoading) => { + &self.metrics.metric_get_miss_already_loading + } + None => &self.metrics.metric_get_cancelled, + } + .record(duration); + } + None => { + warn!("Clock went backwards, not recording cache GET duration"); + } + } + + if let Some(status) = self.status { + self.span_recorder.ok(status.name()); + } + } +} + +/// Helper that set's PEEK metrics on drop depending on the `status`. +/// +/// A drop might happen due to completion (in which case the `status` should be set) or if the future is cancelled (in +/// which case the `status` is `None`). 
+struct SetPeekMetricOnDrop<'a> { + metrics: &'a Metrics, + t_start: Time, + status: Option>, + span_recorder: SpanRecorder, +} + +impl<'a> SetPeekMetricOnDrop<'a> { + fn new(metrics: &'a Metrics, span: Option) -> Self { + let t_start = metrics.time_provider.now(); + + Self { + metrics, + t_start, + status: None, + span_recorder: SpanRecorder::new(span), + } + } +} + +impl<'a> Drop for SetPeekMetricOnDrop<'a> { + fn drop(&mut self) { + let t_end = self.metrics.time_provider.now(); + + match t_end.checked_duration_since(self.t_start) { + Some(duration) => { + match self.status { + Some(Some(CachePeekStatus::Hit)) => &self.metrics.metric_peek_hit, + Some(Some(CachePeekStatus::MissAlreadyLoading)) => { + &self.metrics.metric_peek_miss_already_loading + } + Some(None) => &self.metrics.metric_peek_miss, + None => &self.metrics.metric_peek_cancelled, + } + .record(duration); + } + None => { + warn!("Clock went backwards, not recording cache PEEK duration"); + } + } + + if let Some(status) = self.status { + self.span_recorder + .ok(status.map(|status| status.name()).unwrap_or("miss")); + } + } +} + +#[cfg(test)] +mod tests { + use std::{collections::HashMap, time::Duration}; + + use futures::{stream::FuturesUnordered, StreamExt}; + use iox_time::{MockProvider, Time}; + use metric::{HistogramObservation, Observation, RawReporter}; + use tokio::sync::Barrier; + use trace::{span::SpanStatus, RingBufferTraceCollector}; + + use crate::{ + cache::{ + driver::CacheDriver, + test_util::{run_test_generic, TestAdapter}, + }, + loader::test_util::TestLoader, + test_util::{AbortAndWaitExt, EnsurePendingExt}, + }; + + use super::*; + + #[tokio::test] + async fn test_generic() { + run_test_generic(MyTestAdapter).await; + } + + struct MyTestAdapter; + + impl TestAdapter for MyTestAdapter { + type GetExtra = (bool, Option); + type PeekExtra = ((), Option); + type Cache = CacheWithMetrics, TestLoader>>; + + fn construct(&self, loader: Arc) -> Arc { + 
TestMetricsCache::new_with_loader(loader).cache + } + + fn get_extra(&self, inner: bool) -> Self::GetExtra { + (inner, None) + } + + fn peek_extra(&self) -> Self::PeekExtra { + ((), None) + } + } + + #[tokio::test] + async fn test_get() { + let test_cache = TestMetricsCache::new(); + + let traces = Arc::new(RingBufferTraceCollector::new(1_000)); + + let mut reporter = RawReporter::default(); + test_cache.metric_registry.report(&mut reporter); + + for status in ["hit", "miss", "miss_already_loading", "cancelled"] { + let hist = get_metric_cache_get(&reporter, status); + assert_eq!(hist.sample_count(), 0); + assert_eq!(hist.total, Duration::from_secs(0)); + } + + test_cache.loader.block_global(); + + let barrier_pending_1 = Arc::new(Barrier::new(2)); + let barrier_pending_1_captured = Arc::clone(&barrier_pending_1); + let traces_captured = Arc::clone(&traces); + let cache_captured = Arc::clone(&test_cache.cache); + let join_handle_1 = tokio::task::spawn(async move { + cache_captured + .get( + 1, + ( + true, + Some(Span::root("miss", Arc::clone(&traces_captured) as _)), + ), + ) + .ensure_pending(barrier_pending_1_captured) + .await + }); + + barrier_pending_1.wait().await; + let d1 = Duration::from_secs(1); + test_cache.time_provider.inc(d1); + let barrier_pending_2 = Arc::new(Barrier::new(2)); + let barrier_pending_2_captured = Arc::clone(&barrier_pending_2); + let traces_captured = Arc::clone(&traces); + let cache_captured = Arc::clone(&test_cache.cache); + let n_miss_already_loading = 10; + let join_handle_2 = tokio::task::spawn(async move { + (0..n_miss_already_loading) + .map(|_| { + cache_captured.get( + 1, + ( + true, + Some(Span::root( + "miss_already_loading", + Arc::clone(&traces_captured) as _, + )), + ), + ) + }) + .collect::>() + .collect::>() + .ensure_pending(barrier_pending_2_captured) + .await + }); + + barrier_pending_2.wait().await; + let d2 = Duration::from_secs(3); + test_cache.time_provider.inc(d2); + test_cache.loader.mock_next(1, "v".into()); 
+ test_cache.loader.unblock_global(); + + join_handle_1.await.unwrap(); + join_handle_2.await.unwrap(); + + test_cache.loader.block_global(); + test_cache.time_provider.inc(Duration::from_secs(10)); + let n_hit = 100; + for _ in 0..n_hit { + test_cache + .cache + .get(1, (true, Some(Span::root("hit", Arc::clone(&traces) as _)))) + .await; + } + + let n_cancelled = 200; + let barrier_pending_3 = Arc::new(Barrier::new(2)); + let barrier_pending_3_captured = Arc::clone(&barrier_pending_3); + let traces_captured = Arc::clone(&traces); + let cache_captured = Arc::clone(&test_cache.cache); + let join_handle_3 = tokio::task::spawn(async move { + (0..n_cancelled) + .map(|_| { + cache_captured.get( + 2, + ( + true, + Some(Span::root("cancelled", Arc::clone(&traces_captured) as _)), + ), + ) + }) + .collect::>() + .collect::>() + .ensure_pending(barrier_pending_3_captured) + .await + }); + + barrier_pending_3.wait().await; + let d3 = Duration::from_secs(20); + test_cache.time_provider.inc(d3); + join_handle_3.abort_and_wait().await; + + let mut reporter = RawReporter::default(); + test_cache.metric_registry.report(&mut reporter); + + let hist = get_metric_cache_get(&reporter, "hit"); + assert_eq!(hist.sample_count(), n_hit); + // "hit"s are instant because there's no lock contention + assert_eq!(hist.total, Duration::from_secs(0)); + + let hist = get_metric_cache_get(&reporter, "miss"); + let n = 1; + assert_eq!(hist.sample_count(), n); + assert_eq!(hist.total, (n as u32) * (d1 + d2)); + + let hist = get_metric_cache_get(&reporter, "miss_already_loading"); + assert_eq!(hist.sample_count(), n_miss_already_loading); + assert_eq!(hist.total, (n_miss_already_loading as u32) * d2); + + let hist = get_metric_cache_get(&reporter, "cancelled"); + assert_eq!(hist.sample_count(), n_cancelled); + assert_eq!(hist.total, (n_cancelled as u32) * d3); + + // check spans + assert_n_spans(&traces, "hit", SpanStatus::Ok, n_hit as usize); + assert_n_spans(&traces, "miss", SpanStatus::Ok, 1); + 
assert_n_spans( + &traces, + "miss_already_loading", + SpanStatus::Ok, + n_miss_already_loading as usize, + ); + assert_n_spans( + &traces, + "cancelled", + SpanStatus::Unknown, + n_cancelled as usize, + ); + } + + #[tokio::test] + async fn test_peek() { + let test_cache = TestMetricsCache::new(); + + let traces = Arc::new(RingBufferTraceCollector::new(1_000)); + + let mut reporter = RawReporter::default(); + test_cache.metric_registry.report(&mut reporter); + + for status in ["hit", "miss", "miss_already_loading", "cancelled"] { + let hist = get_metric_cache_peek(&reporter, status); + assert_eq!(hist.sample_count(), 0); + assert_eq!(hist.total, Duration::from_secs(0)); + } + + test_cache.loader.block_global(); + + test_cache + .cache + .peek(1, ((), Some(Span::root("miss", Arc::clone(&traces) as _)))) + .await; + + let barrier_pending_1 = Arc::new(Barrier::new(2)); + let barrier_pending_1_captured = Arc::clone(&barrier_pending_1); + let cache_captured = Arc::clone(&test_cache.cache); + let join_handle_1 = tokio::task::spawn(async move { + cache_captured + .get(1, (true, None)) + .ensure_pending(barrier_pending_1_captured) + .await + }); + + barrier_pending_1.wait().await; + let d1 = Duration::from_secs(1); + test_cache.time_provider.inc(d1); + let barrier_pending_2 = Arc::new(Barrier::new(2)); + let barrier_pending_2_captured = Arc::clone(&barrier_pending_2); + let traces_captured = Arc::clone(&traces); + let cache_captured = Arc::clone(&test_cache.cache); + let n_miss_already_loading = 10; + let join_handle_2 = tokio::task::spawn(async move { + (0..n_miss_already_loading) + .map(|_| { + cache_captured.peek( + 1, + ( + (), + Some(Span::root( + "miss_already_loading", + Arc::clone(&traces_captured) as _, + )), + ), + ) + }) + .collect::>() + .collect::>() + .ensure_pending(barrier_pending_2_captured) + .await + }); + + barrier_pending_2.wait().await; + let d2 = Duration::from_secs(3); + test_cache.time_provider.inc(d2); + test_cache.loader.mock_next(1, "v".into()); 
+ test_cache.loader.unblock_global(); + + join_handle_1.await.unwrap(); + join_handle_2.await.unwrap(); + + test_cache.loader.block_global(); + test_cache.time_provider.inc(Duration::from_secs(10)); + let n_hit = 100; + for _ in 0..n_hit { + test_cache + .cache + .peek(1, ((), Some(Span::root("hit", Arc::clone(&traces) as _)))) + .await; + } + + let n_cancelled = 200; + let barrier_pending_3 = Arc::new(Barrier::new(2)); + let barrier_pending_3_captured = Arc::clone(&barrier_pending_3); + let cache_captured = Arc::clone(&test_cache.cache); + tokio::task::spawn(async move { + cache_captured + .get(2, (true, None)) + .ensure_pending(barrier_pending_3_captured) + .await + }); + barrier_pending_3.wait().await; + let barrier_pending_4 = Arc::new(Barrier::new(2)); + let barrier_pending_4_captured = Arc::clone(&barrier_pending_4); + let traces_captured = Arc::clone(&traces); + let cache_captured = Arc::clone(&test_cache.cache); + let join_handle_3 = tokio::task::spawn(async move { + (0..n_cancelled) + .map(|_| { + cache_captured.peek( + 2, + ( + (), + Some(Span::root("cancelled", Arc::clone(&traces_captured) as _)), + ), + ) + }) + .collect::>() + .collect::>() + .ensure_pending(barrier_pending_4_captured) + .await + }); + + barrier_pending_4.wait().await; + let d3 = Duration::from_secs(20); + test_cache.time_provider.inc(d3); + join_handle_3.abort_and_wait().await; + + let mut reporter = RawReporter::default(); + test_cache.metric_registry.report(&mut reporter); + + let hist = get_metric_cache_peek(&reporter, "hit"); + assert_eq!(hist.sample_count(), n_hit); + // "hit"s are instant because there's no lock contention + assert_eq!(hist.total, Duration::from_secs(0)); + + let hist = get_metric_cache_peek(&reporter, "miss"); + let n = 1; + assert_eq!(hist.sample_count(), n); + // "miss"es are instant + assert_eq!(hist.total, Duration::from_secs(0)); + + let hist = get_metric_cache_peek(&reporter, "miss_already_loading"); + assert_eq!(hist.sample_count(), 
n_miss_already_loading); + assert_eq!(hist.total, (n_miss_already_loading as u32) * d2); + + let hist = get_metric_cache_peek(&reporter, "cancelled"); + assert_eq!(hist.sample_count(), n_cancelled); + assert_eq!(hist.total, (n_cancelled as u32) * d3); + + // check spans + assert_n_spans(&traces, "hit", SpanStatus::Ok, n_hit as usize); + assert_n_spans(&traces, "miss", SpanStatus::Ok, 1); + assert_n_spans( + &traces, + "miss_already_loading", + SpanStatus::Ok, + n_miss_already_loading as usize, + ); + assert_n_spans( + &traces, + "cancelled", + SpanStatus::Unknown, + n_cancelled as usize, + ); + } + + #[tokio::test] + async fn test_set() { + let test_cache = TestMetricsCache::new(); + + let mut reporter = RawReporter::default(); + test_cache.metric_registry.report(&mut reporter); + assert_eq!( + reporter + .metric("iox_cache_set") + .unwrap() + .observation(&[("name", "test")]) + .unwrap(), + &Observation::U64Counter(0) + ); + + test_cache.cache.set(1, String::from("foo")).await; + + let mut reporter = RawReporter::default(); + test_cache.metric_registry.report(&mut reporter); + assert_eq!( + reporter + .metric("iox_cache_set") + .unwrap() + .observation(&[("name", "test")]) + .unwrap(), + &Observation::U64Counter(1) + ); + } + + struct TestMetricsCache { + loader: Arc, + time_provider: Arc, + metric_registry: metric::Registry, + cache: Arc, TestLoader>>>, + } + + impl TestMetricsCache { + fn new() -> Self { + Self::new_with_loader(Arc::new(TestLoader::default())) + } + + fn new_with_loader(loader: Arc) -> Self { + let inner = CacheDriver::new( + Arc::clone(&loader) as _, + HashMap::new(), + &metric::Registry::default(), + "test", + ); + let time_provider = + Arc::new(MockProvider::new(Time::from_timestamp_millis(0).unwrap())); + let metric_registry = metric::Registry::new(); + let cache = Arc::new(CacheWithMetrics::new( + inner, + "test", + Arc::clone(&time_provider) as _, + &metric_registry, + )); + + Self { + loader, + time_provider, + metric_registry, + cache, + 
} + } + } + + fn get_metric_cache_get( + reporter: &RawReporter, + status: &'static str, + ) -> HistogramObservation { + if let Observation::DurationHistogram(hist) = reporter + .metric("iox_cache_get") + .unwrap() + .observation(&[("name", "test"), ("status", status)]) + .unwrap() + { + hist.clone() + } else { + panic!("Wrong observation type"); + } + } + + fn get_metric_cache_peek( + reporter: &RawReporter, + status: &'static str, + ) -> HistogramObservation { + if let Observation::DurationHistogram(hist) = reporter + .metric("iox_cache_peek") + .unwrap() + .observation(&[("name", "test"), ("status", status)]) + .unwrap() + { + hist.clone() + } else { + panic!("Wrong observation type"); + } + } + + fn assert_n_spans( + traces: &RingBufferTraceCollector, + name: &'static str, + status: SpanStatus, + expected: usize, + ) { + let actual = traces + .spans() + .into_iter() + .filter(|span| (span.name == name) && (span.status == status)) + .count(); + assert_eq!(actual, expected); + } +} diff --git a/cache_system/src/cache/mod.rs b/cache_system/src/cache/mod.rs new file mode 100644 index 0000000..ba3d541 --- /dev/null +++ b/cache_system/src/cache/mod.rs @@ -0,0 +1,167 @@ +//! Top-level trait ([`Cache`]) that provides a fully functional cache. +//! +//! Caches usually combine a [backend](crate::backend) with a [loader](crate::loader). The easiest way to achieve that +//! is to use [`CacheDriver`](crate::cache::driver::CacheDriver). Caches might also wrap inner caches to provide certain +//! extra functionality like metrics. +use std::{fmt::Debug, hash::Hash}; + +use async_trait::async_trait; + +pub mod driver; +pub mod metrics; + +#[cfg(test)] +mod test_util; + +/// Status of a [`Cache`] [GET](Cache::get_with_status) request. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CacheGetStatus { + /// The requested entry was present in the storage backend. 
+ Hit, + + /// The requested entry was NOT present in the storage backend and the loader had no previous query running. + Miss, + + /// The requested entry was NOT present in the storage backend, but there was already a loader query running for + /// this particular key. + MissAlreadyLoading, +} + +impl CacheGetStatus { + /// Get human and machine readable name. + pub fn name(&self) -> &'static str { + match self { + Self::Hit => "hit", + Self::Miss => "miss", + Self::MissAlreadyLoading => "miss_already_loading", + } + } +} + +/// Status of a [`Cache`] [PEEK](Cache::peek_with_status) request. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CachePeekStatus { + /// The requested entry was present in the storage backend. + Hit, + + /// The requested entry was NOT present in the storage backend, but there was already a loader query running for + /// this particular key. + MissAlreadyLoading, +} + +impl CachePeekStatus { + /// Get human and machine redable name. + pub fn name(&self) -> &'static str { + match self { + Self::Hit => "hit", + Self::MissAlreadyLoading => "miss_already_loading", + } + } +} + +/// High-level cache implementation. +/// +/// # Concurrency +/// +/// Multiple cache requests for different keys can run at the same time. When data is requested for +/// the same key the underlying loader will only be polled once, even when the requests are made +/// while the loader is still running. +/// +/// # Cancellation +/// +/// Canceling a [`get`](Self::get) request will NOT cancel the underlying loader. The data will +/// still be cached. +/// +/// # Panic +/// +/// If the underlying loader panics, all currently running [`get`](Self::get) requests will panic. +/// The data will NOT be cached. +#[async_trait] +pub trait Cache: Debug + Send + Sync + 'static { + /// Cache key. + type K: Clone + Eq + Hash + Debug + Ord + Send + 'static; + + /// Cache value. 
+ type V: Clone + Debug + Send + 'static; + + /// Extra data that is provided during [`GET`](Self::get) but that is NOT part of the cache key. + type GetExtra: Debug + Send + 'static; + + /// Extra data that is provided during [`PEEK`](Self::peek) but that is NOT part of the cache key. + type PeekExtra: Debug + Send + 'static; + + /// Get value from cache. + /// + /// Note that `extra` is only used if the key is missing from the storage backend and no loader query is running yet. + async fn get(&self, k: Self::K, extra: Self::GetExtra) -> Self::V { + self.get_with_status(k, extra).await.0 + } + + /// Get value from cache and the [status](CacheGetStatus). + /// + /// Note that `extra` is only used if the key is missing from the storage backend and no loader query is running yet. + async fn get_with_status(&self, k: Self::K, extra: Self::GetExtra) + -> (Self::V, CacheGetStatus); + + /// Peek value from cache. + /// + /// In contrast to [`get`](Self::get) this will only return a value if there is a stored value or the value loading + /// is already in progress. This will NOT start a new loading task. + /// + /// Note that `extra` is only used if the key is missing from the storage backend and no loader query is running yet. + async fn peek(&self, k: Self::K, extra: Self::PeekExtra) -> Option { + self.peek_with_status(k, extra).await.map(|(v, _status)| v) + } + + /// Peek value from cache and the [status](CachePeekStatus). + /// + /// In contrast to [`get_with_status`](Self::get_with_status) this will only return a value if there is a stored + /// value or the value loading is already in progress. This will NOT start a new loading task. + /// + /// Note that `extra` is only used if the key is missing from the storage backend and no loader query is running yet. + async fn peek_with_status( + &self, + k: Self::K, + extra: Self::PeekExtra, + ) -> Option<(Self::V, CachePeekStatus)>; + + /// Side-load an entry into the cache. 
+ /// + /// This will also complete a currently running request for this key. + async fn set(&self, k: Self::K, v: Self::V); +} + +#[async_trait] +impl Cache + for Box> +where + K: Clone + Eq + Hash + Debug + Ord + Send + 'static, + V: Clone + Debug + Send + 'static, + GetExtra: Debug + Send + 'static, + PeekExtra: Debug + Send + 'static, +{ + type K = K; + type V = V; + type GetExtra = GetExtra; + type PeekExtra = PeekExtra; + + async fn get_with_status( + &self, + k: Self::K, + extra: Self::GetExtra, + ) -> (Self::V, CacheGetStatus) { + self.as_ref().get_with_status(k, extra).await + } + + async fn peek_with_status( + &self, + k: Self::K, + extra: Self::PeekExtra, + ) -> Option<(Self::V, CachePeekStatus)> { + self.as_ref().peek_with_status(k, extra).await + } + + async fn set(&self, k: Self::K, v: Self::V) { + self.as_ref().set(k, v).await + } +} diff --git a/cache_system/src/cache/test_util.rs b/cache_system/src/cache/test_util.rs new file mode 100644 index 0000000..b149eec --- /dev/null +++ b/cache_system/src/cache/test_util.rs @@ -0,0 +1,462 @@ +use std::{sync::Arc, time::Duration}; + +use tokio::sync::Barrier; + +use crate::{ + cache::{CacheGetStatus, CachePeekStatus}, + loader::test_util::TestLoader, + test_util::{AbortAndWaitExt, EnsurePendingExt}, +}; + +use super::Cache; + +/// Interface between generic tests and a concrete cache type. +pub trait TestAdapter: Send + Sync + 'static { + /// Extra information for GET. + type GetExtra: Send; + + /// Extra information for PEEK. + type PeekExtra: Send; + + /// Cache type. + type Cache: Cache; + + /// Create new cache with given loader. + fn construct(&self, loader: Arc) -> Arc; + + /// Build [`GetExtra`](Self::GetExtra). + /// + /// Must contain a [`bool`] payload that is later included into the value string for testing purposes. + fn get_extra(&self, inner: bool) -> Self::GetExtra; + + /// Build [`PeekExtra`](Self::PeekExtra). + fn peek_extra(&self) -> Self::PeekExtra; +} + +/// Setup test. 
+fn setup(adapter: &T) -> (Arc, Arc) +where + T: TestAdapter, +{ + let loader = Arc::new(TestLoader::default()); + let cache = adapter.construct(Arc::clone(&loader)); + (cache, loader) +} + +pub async fn run_test_generic(adapter: T) +where + T: TestAdapter, +{ + let adapter = Arc::new(adapter); + + test_answers_are_correct(Arc::clone(&adapter)).await; + test_linear_memory(Arc::clone(&adapter)).await; + test_concurrent_query_loads_once(Arc::clone(&adapter)).await; + test_queries_are_parallelized(Arc::clone(&adapter)).await; + test_cancel_request(Arc::clone(&adapter)).await; + test_panic_request(Arc::clone(&adapter)).await; + test_drop_cancels_loader(Arc::clone(&adapter)).await; + test_set_before_request(Arc::clone(&adapter)).await; + test_set_during_request(Arc::clone(&adapter)).await; +} + +async fn test_answers_are_correct(adapter: Arc) +where + T: TestAdapter, +{ + let (cache, loader) = setup(adapter.as_ref()); + + loader.mock_next(1, "res_1".to_owned()); + loader.mock_next(2, "res_2".to_owned()); + + assert_eq!( + cache.get(1, adapter.get_extra(true)).await, + String::from("res_1") + ); + assert_eq!( + cache.peek(1, adapter.peek_extra()).await, + Some(String::from("res_1")) + ); + assert_eq!( + cache.get(2, adapter.get_extra(false)).await, + String::from("res_2") + ); + assert_eq!( + cache.peek(2, adapter.peek_extra()).await, + Some(String::from("res_2")) + ); +} + +async fn test_linear_memory(adapter: Arc) +where + T: TestAdapter, +{ + let (cache, loader) = setup(adapter.as_ref()); + + loader.mock_next(1, "res_1".to_owned()); + loader.mock_next(2, "res_2".to_owned()); + + assert_eq!(cache.peek_with_status(1, adapter.peek_extra()).await, None,); + assert_eq!( + cache.get_with_status(1, adapter.get_extra(true)).await, + (String::from("res_1"), CacheGetStatus::Miss), + ); + assert_eq!( + cache.get_with_status(1, adapter.get_extra(false)).await, + (String::from("res_1"), CacheGetStatus::Hit), + ); + assert_eq!( + cache.peek_with_status(1, 
adapter.peek_extra()).await, + Some((String::from("res_1"), CachePeekStatus::Hit)), + ); + assert_eq!( + cache.get_with_status(2, adapter.get_extra(false)).await, + (String::from("res_2"), CacheGetStatus::Miss), + ); + assert_eq!( + cache.get_with_status(2, adapter.get_extra(false)).await, + (String::from("res_2"), CacheGetStatus::Hit), + ); + assert_eq!( + cache.get_with_status(1, adapter.get_extra(true)).await, + (String::from("res_1"), CacheGetStatus::Hit), + ); + assert_eq!( + cache.peek_with_status(1, adapter.peek_extra()).await, + Some((String::from("res_1"), CachePeekStatus::Hit)), + ); + + assert_eq!(loader.loaded(), vec![(1, true), (2, false)]); +} + +async fn test_concurrent_query_loads_once(adapter: Arc) +where + T: TestAdapter, +{ + let (cache, loader) = setup(adapter.as_ref()); + + loader.block_global(); + + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let barrier_pending_1 = Arc::new(Barrier::new(2)); + let barrier_pending_1_captured = Arc::clone(&barrier_pending_1); + let handle_1 = tokio::spawn(async move { + cache_captured + .get_with_status(1, adapter_captured.get_extra(true)) + .ensure_pending(barrier_pending_1_captured) + .await + }); + + barrier_pending_1.wait().await; + + let barrier_pending_2 = Arc::new(Barrier::new(3)); + + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let barrier_pending_2_captured = Arc::clone(&barrier_pending_2); + let handle_2 = tokio::spawn(async move { + // use a different `extra` here to proof that the first one was used + cache_captured + .get_with_status(1, adapter_captured.get_extra(false)) + .ensure_pending(barrier_pending_2_captured) + .await + }); + let barrier_pending_2_captured = Arc::clone(&barrier_pending_2); + let handle_3 = tokio::spawn(async move { + // use a different `extra` here to proof that the first one was used + cache + .peek_with_status(1, adapter.peek_extra()) + .ensure_pending(barrier_pending_2_captured) + 
.await + }); + + barrier_pending_2.wait().await; + loader.mock_next(1, "res_1".to_owned()); + // Shouldn't issue concurrent load requests for the same key + let n_blocked = loader.unblock_global(); + assert_eq!(n_blocked, 1); + + assert_eq!( + handle_1.await.unwrap(), + (String::from("res_1"), CacheGetStatus::Miss), + ); + assert_eq!( + handle_2.await.unwrap(), + (String::from("res_1"), CacheGetStatus::MissAlreadyLoading), + ); + assert_eq!( + handle_3.await.unwrap(), + Some((String::from("res_1"), CachePeekStatus::MissAlreadyLoading)), + ); + + assert_eq!(loader.loaded(), vec![(1, true)]); +} + +async fn test_queries_are_parallelized(adapter: Arc) +where + T: TestAdapter, +{ + let (cache, loader) = setup(adapter.as_ref()); + + loader.block_global(); + + let barrier = Arc::new(Barrier::new(4)); + + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let barrier_captured = Arc::clone(&barrier); + let handle_1 = tokio::spawn(async move { + cache_captured + .get(1, adapter_captured.get_extra(true)) + .ensure_pending(barrier_captured) + .await + }); + + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let barrier_captured = Arc::clone(&barrier); + let handle_2 = tokio::spawn(async move { + cache_captured + .get(1, adapter_captured.get_extra(true)) + .ensure_pending(barrier_captured) + .await + }); + + let barrier_captured = Arc::clone(&barrier); + let handle_3 = tokio::spawn(async move { + cache + .get(2, adapter.get_extra(false)) + .ensure_pending(barrier_captured) + .await + }); + + barrier.wait().await; + + loader.mock_next(1, "res_1".to_owned()); + loader.mock_next(2, "res_2".to_owned()); + + let n_blocked = loader.unblock_global(); + assert_eq!(n_blocked, 2); + + assert_eq!(handle_1.await.unwrap(), String::from("res_1")); + assert_eq!(handle_2.await.unwrap(), String::from("res_1")); + assert_eq!(handle_3.await.unwrap(), String::from("res_2")); + + assert_eq!(loader.loaded(), vec![(1, 
true), (2, false)]); +} + +async fn test_cancel_request(adapter: Arc) +where + T: TestAdapter, +{ + let (cache, loader) = setup(adapter.as_ref()); + + loader.block_global(); + + let barrier_pending_1 = Arc::new(Barrier::new(2)); + let barrier_pending_1_captured = Arc::clone(&barrier_pending_1); + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let handle_1 = tokio::spawn(async move { + cache_captured + .get(1, adapter_captured.get_extra(true)) + .ensure_pending(barrier_pending_1_captured) + .await + }); + + barrier_pending_1.wait().await; + let barrier_pending_2 = Arc::new(Barrier::new(2)); + let barrier_pending_2_captured = Arc::clone(&barrier_pending_2); + let handle_2 = tokio::spawn(async move { + cache + .get(1, adapter.get_extra(false)) + .ensure_pending(barrier_pending_2_captured) + .await + }); + + barrier_pending_2.wait().await; + + // abort first handle + handle_1.abort_and_wait().await; + + loader.mock_next(1, "res_1".to_owned()); + + let n_blocked = loader.unblock_global(); + assert_eq!(n_blocked, 1); + + assert_eq!(handle_2.await.unwrap(), String::from("res_1")); + + assert_eq!(loader.loaded(), vec![(1, true)]); +} + +async fn test_panic_request(adapter: Arc) +where + T: TestAdapter, +{ + let (cache, loader) = setup(adapter.as_ref()); + + loader.block_global(); + + // set up initial panicking request + let barrier_pending_get_panic = Arc::new(Barrier::new(2)); + let barrier_pending_get_panic_captured = Arc::clone(&barrier_pending_get_panic); + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let handle_get_panic = tokio::spawn(async move { + cache_captured + .get(1, adapter_captured.get_extra(true)) + .ensure_pending(barrier_pending_get_panic_captured) + .await + }); + + barrier_pending_get_panic.wait().await; + + // set up other requests + let barrier_pending_others = Arc::new(Barrier::new(4)); + + let barrier_pending_others_captured = 
Arc::clone(&barrier_pending_others); + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let handle_get_while_loading_panic = tokio::spawn(async move { + cache_captured + .get(1, adapter_captured.get_extra(false)) + .ensure_pending(barrier_pending_others_captured) + .await + }); + + let barrier_pending_others_captured = Arc::clone(&barrier_pending_others); + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let handle_peek_while_loading_panic = tokio::spawn(async move { + cache_captured + .peek(1, adapter_captured.peek_extra()) + .ensure_pending(barrier_pending_others_captured) + .await + }); + + let barrier_pending_others_captured = Arc::clone(&barrier_pending_others); + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let handle_get_other_key = tokio::spawn(async move { + cache_captured + .get(2, adapter_captured.get_extra(false)) + .ensure_pending(barrier_pending_others_captured) + .await + }); + + barrier_pending_others.wait().await; + + loader.panic_next(1); + loader.mock_next(1, "res_1".to_owned()); + loader.mock_next(2, "res_2".to_owned()); + + let n_blocked = loader.unblock_global(); + assert_eq!(n_blocked, 2); + + // panic of initial request + handle_get_panic.await.unwrap_err(); + + // requests that use the same loading status also panic + handle_get_while_loading_panic.await.unwrap_err(); + handle_peek_while_loading_panic.await.unwrap_err(); + + // unrelated request should succeed + assert_eq!(handle_get_other_key.await.unwrap(), String::from("res_2")); + + // failing key was tried exactly once (and the other unrelated key as well) + assert_eq!(loader.loaded(), vec![(1, true), (2, false)]); + + // loading after panic just works (no poisoning) + assert_eq!( + cache.get(1, adapter.get_extra(false)).await, + String::from("res_1") + ); + assert_eq!(loader.loaded(), vec![(1, true), (2, false), (1, false)]); +} + +async fn 
test_drop_cancels_loader(adapter: Arc) +where + T: TestAdapter, +{ + let (cache, loader) = setup(adapter.as_ref()); + + loader.block_global(); + + let barrier_pending = Arc::new(Barrier::new(2)); + let barrier_pending_captured = Arc::clone(&barrier_pending); + let handle = tokio::spawn(async move { + cache + .get(1, adapter.get_extra(true)) + .ensure_pending(barrier_pending_captured) + .await + }); + + barrier_pending.wait().await; + + handle.abort_and_wait().await; + + assert_eq!(Arc::strong_count(&loader), 1); +} + +async fn test_set_before_request(adapter: Arc) +where + T: TestAdapter, +{ + let (cache, loader) = setup(adapter.as_ref()); + + loader.block_global(); + + cache.set(1, String::from("foo")).await; + + // blocked loader is not used + let res = tokio::time::timeout( + Duration::from_millis(10), + cache.get(1, adapter.get_extra(false)), + ) + .await + .unwrap(); + assert_eq!(res, String::from("foo")); + assert_eq!(loader.loaded(), Vec::<(u8, bool)>::new()); +} + +async fn test_set_during_request(adapter: Arc) +where + T: TestAdapter, +{ + let (cache, loader) = setup(adapter.as_ref()); + + loader.block_global(); + + let adapter_captured = Arc::clone(&adapter); + let cache_captured = Arc::clone(&cache); + let barrier_pending = Arc::new(Barrier::new(2)); + let barrier_pending_captured = Arc::clone(&barrier_pending); + let handle = tokio::spawn(async move { + cache_captured + .get(1, adapter_captured.get_extra(true)) + .ensure_pending(barrier_pending_captured) + .await + }); + barrier_pending.wait().await; + + cache.set(1, String::from("foo")).await; + + // request succeeds even though the loader is blocked + let res = tokio::time::timeout(Duration::from_millis(10), handle) + .await + .unwrap() + .unwrap(); + assert_eq!(res, String::from("foo")); + assert_eq!(loader.loaded(), vec![(1, true)]); + + // still cached + let res = tokio::time::timeout( + Duration::from_millis(10), + cache.get(1, adapter.get_extra(false)), + ) + .await + .unwrap(); + assert_eq!(res, 
String::from("foo")); + assert_eq!(loader.loaded(), vec![(1, true)]); +} diff --git a/cache_system/src/cancellation_safe_future.rs b/cache_system/src/cancellation_safe_future.rs new file mode 100644 index 0000000..ae45fc3 --- /dev/null +++ b/cache_system/src/cancellation_safe_future.rs @@ -0,0 +1,184 @@ +use std::{ + future::Future, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use futures::future::BoxFuture; +use parking_lot::Mutex; +use tokio::task::JoinHandle; + +/// Receiver for [`CancellationSafeFuture`] join handles if the future was rescued from cancellation. +/// +/// `T` is the [output type](Future::Output) of the wrapped future. +#[derive(Debug, Default, Clone)] +pub struct CancellationSafeFutureReceiver { + inner: Arc>, +} + +#[derive(Debug, Default)] +struct ReceiverInner { + slot: Mutex>>, +} + +impl Drop for ReceiverInner { + fn drop(&mut self) { + let handle = self.slot.lock(); + if let Some(handle) = handle.as_ref() { + handle.abort(); + } + } +} + +/// Wrapper around a future that cannot be cancelled. +/// +/// When the future is dropped/cancelled, we'll spawn a tokio task to _rescue_ it. +pub struct CancellationSafeFuture +where + F: Future + Send + 'static, + F::Output: Send, +{ + /// Mark if the inner future finished. If not, we must spawn a helper task on drop. + done: bool, + + /// Inner future. + /// + /// Wrapped in an `Option` so we can extract it during drop. Inside that option however we also need a pinned + /// box because once this wrapper is polled, it will be pinned in memory -- even during drop. Now the inner + /// future does not necessarily implement `Unpin`, so we need a heap allocation to pin it in memory even when we + /// move it out of this option. + inner: Option>, + + /// Where to store the join handle on drop. 
+ receiver: CancellationSafeFutureReceiver, +} + +impl Drop for CancellationSafeFuture +where + F: Future + Send + 'static, + F::Output: Send, +{ + fn drop(&mut self) { + if !self.done { + // acquire lock BEFORE checking the Arc + let mut receiver = self.receiver.inner.slot.lock(); + assert!(receiver.is_none()); + + // The Mutex is owned by the Arc and cannot be moved out of it. So after we acquired the lock we can safely + // check if any external party still has access to the receiver state. If not, we assume there is no + // interest in this future at all (e.g. during shutdown) and will NOT spawn it. + if Arc::strong_count(&self.receiver.inner) > 1 { + let inner = self.inner.take().expect("Double-drop?"); + let handle = tokio::task::spawn(inner); + *receiver = Some(handle); + } + } + } +} + +impl CancellationSafeFuture +where + F: Future + Send, + F::Output: Send, +{ + /// Create new future that is protected from cancellation. + /// + /// If [`CancellationSafeFuture`] is cancelled (i.e. dropped) and there is still some external receiver of the state + /// left, than we will drive the payload (`f`) to completion. Otherwise `f` will be cancelled. 
+ pub fn new(fut: F, receiver: CancellationSafeFutureReceiver) -> Self { + Self { + done: false, + inner: Some(Box::pin(fut)), + receiver, + } + } +} + +impl Future for CancellationSafeFuture +where + F: Future + Send, + F::Output: Send, +{ + type Output = F::Output; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + assert!(!self.done, "Polling future that already returned"); + + match self.inner.as_mut().expect("not dropped").as_mut().poll(cx) { + Poll::Ready(res) => { + self.done = true; + Poll::Ready(res) + } + Poll::Pending => Poll::Pending, + } + } +} + +#[cfg(test)] +mod tests { + use std::{ + sync::atomic::{AtomicBool, Ordering}, + time::Duration, + }; + + use tokio::sync::Barrier; + + use super::*; + + #[tokio::test] + async fn test_happy_path() { + let done = Arc::new(AtomicBool::new(false)); + let done_captured = Arc::clone(&done); + + let receiver = Default::default(); + let fut = CancellationSafeFuture::new( + async move { + done_captured.store(true, Ordering::SeqCst); + }, + receiver, + ); + + fut.await; + + assert!(done.load(Ordering::SeqCst)); + } + + #[tokio::test] + async fn test_cancel_future() { + let done = Arc::new(Barrier::new(2)); + let done_captured = Arc::clone(&done); + + let receiver = CancellationSafeFutureReceiver::default(); + let fut = CancellationSafeFuture::new( + async move { + done_captured.wait().await; + }, + receiver.clone(), + ); + + drop(fut); + + tokio::time::timeout(Duration::from_secs(5), done.wait()) + .await + .unwrap(); + } + + #[tokio::test] + async fn test_receiver_gone() { + let done = Arc::new(Barrier::new(2)); + let done_captured = Arc::clone(&done); + + let receiver = Default::default(); + let fut = CancellationSafeFuture::new( + async move { + done_captured.wait().await; + }, + receiver, + ); + + drop(fut); + + assert_eq!(Arc::strong_count(&done), 1); + } +} diff --git a/cache_system/src/lib.rs b/cache_system/src/lib.rs new file mode 100644 index 0000000..68e60ae --- /dev/null +++ 
b/cache_system/src/lib.rs @@ -0,0 +1,29 @@ +//! Flexible and modular cache system. +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] +#![allow(unreachable_pub)] + +// Workaround for "unused crate" lint false positives. +#[cfg(test)] +use criterion as _; +use workspace_hack as _; + +pub mod addressable_heap; +pub mod backend; +pub mod cache; +mod cancellation_safe_future; +pub mod loader; +pub mod resource_consumption; +#[cfg(test)] +mod test_util; diff --git a/cache_system/src/loader/batch.rs b/cache_system/src/loader/batch.rs new file mode 100644 index 0000000..36ab123 --- /dev/null +++ b/cache_system/src/loader/batch.rs @@ -0,0 +1,501 @@ +//! Batching of loader request. +use std::{ + collections::HashMap, + fmt::Debug, + future::Future, + hash::Hash, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + task::Poll, +}; + +use async_trait::async_trait; +use futures::{ + channel::oneshot::{channel, Sender}, + FutureExt, +}; +use observability_deps::tracing::trace; +use parking_lot::Mutex; + +use crate::cancellation_safe_future::{CancellationSafeFuture, CancellationSafeFutureReceiver}; + +use super::Loader; + +/// Batch [load](Loader::load) requests. +/// +/// Requests against this loader will be [pending](std::task::Poll::Pending) until [flush](BatchLoaderFlusher::flush) is +/// called. To simplify the usage -- esp. in combination with [`Cache::get`] -- use [`BatchLoaderFlusherExt`]. 
+/// +/// +/// [`Cache::get`]: crate::cache::Cache::get +#[derive(Debug)] +pub struct BatchLoader +where + K: Debug + Hash + Send + 'static, + Extra: Debug + Send + 'static, + V: Debug + Send + 'static, + L: Loader, Extra = Vec, V = Vec>, +{ + inner: Arc>, +} + +impl BatchLoader +where + K: Debug + Hash + Send + 'static, + Extra: Debug + Send + 'static, + V: Debug + Send + 'static, + L: Loader, Extra = Vec, V = Vec>, +{ + /// Create new batch loader based on a non-batched, vector-based one. + pub fn new(inner: L) -> Self { + Self { + inner: Arc::new(BatchLoaderInner { + inner, + pending: Default::default(), + job_id_counter: Default::default(), + job_handles: Default::default(), + }), + } + } +} + +/// State of [`BatchLoader`]. +/// +/// This is an extra struct so it can be wrapped into an [`Arc`] and shared with the futures that are spawned into +/// [`CancellationSafeFuture`] +#[derive(Debug)] +struct BatchLoaderInner +where + K: Debug + Hash + Send + 'static, + Extra: Debug + Send + 'static, + V: Debug + Send + 'static, + L: Loader, Extra = Vec, V = Vec>, +{ + inner: L, + pending: Mutex)>>, + job_id_counter: AtomicU64, + job_handles: Mutex>>, +} + +/// Flush interface for [`BatchLoader`]. +/// +/// This is a trait so you can [type-erase](https://en.wikipedia.org/wiki/Type_erasure) it by putting it into an +/// [`Arc`], +/// +/// This trait is object-safe. +#[async_trait] +pub trait BatchLoaderFlusher: Debug + Send + Sync + 'static { + /// Flush all batched requests. 
+ async fn flush(&self); +} + +#[async_trait] +impl BatchLoaderFlusher for Arc { + async fn flush(&self) { + self.as_ref().flush().await; + } +} + +#[async_trait] +impl BatchLoaderFlusher for BatchLoader +where + K: Debug + Hash + Send + 'static, + Extra: Debug + Send + 'static, + V: Debug + Send + 'static, + L: Loader, Extra = Vec, V = Vec>, +{ + async fn flush(&self) { + let pending: Vec<_> = { + let mut pending = self.inner.pending.lock(); + std::mem::take(pending.as_mut()) + }; + + if pending.is_empty() { + return; + } + trace!(n_pending = pending.len(), "flush batch loader",); + + let job_id = self.inner.job_id_counter.fetch_add(1, Ordering::SeqCst); + let handle_recv = CancellationSafeFutureReceiver::default(); + + { + let mut job_handles = self.inner.job_handles.lock(); + job_handles.insert(job_id, handle_recv.clone()); + } + + let inner = Arc::clone(&self.inner); + let fut = CancellationSafeFuture::new( + async move { + let mut keys = Vec::with_capacity(pending.len()); + let mut extras = Vec::with_capacity(pending.len()); + let mut senders = Vec::with_capacity(pending.len()); + + for (k, extra, sender) in pending { + keys.push(k); + extras.push(extra); + senders.push(sender); + } + + let values = inner.inner.load(keys, extras).await; + assert_eq!(values.len(), senders.len()); + + for (value, sender) in values.into_iter().zip(senders) { + sender.send(value).unwrap(); + } + + let mut job_handles = inner.job_handles.lock(); + job_handles.remove(&job_id); + }, + handle_recv, + ); + fut.await; + } +} + +#[async_trait] +impl Loader for BatchLoader +where + K: Debug + Hash + Send + 'static, + Extra: Debug + Send + 'static, + V: Debug + Send + 'static, + L: Loader, Extra = Vec, V = Vec>, +{ + type K = K; + type Extra = Extra; + type V = V; + + async fn load(&self, k: Self::K, extra: Self::Extra) -> Self::V { + let (tx, rx) = channel(); + + { + let mut pending = self.inner.pending.lock(); + pending.push((k, extra, tx)); + } + + rx.await.unwrap() + } +} + +/// 
Extension trait for [`BatchLoaderFlusher`] because the methods on this extension trait are not object safe. +#[async_trait] +pub trait BatchLoaderFlusherExt { + /// Try to poll all given futures and automatically [flush](BatchLoaderFlusher) if any of them end up in a pending state. + /// + /// This guarantees that the order of the results is identical to the order of the futures. + async fn auto_flush(&self, futures: Vec) -> Vec + where + F: Future + Send, + F::Output: Send; +} + +#[async_trait] +impl BatchLoaderFlusherExt for B +where + B: BatchLoaderFlusher, +{ + async fn auto_flush(&self, futures: Vec) -> Vec + where + F: Future + Send, + F::Output: Send, + { + let mut futures = futures + .into_iter() + .map(|f| f.boxed()) + .enumerate() + .collect::>(); + let mut output: Vec> = (0..futures.len()).map(|_| None).collect(); + + while !futures.is_empty() { + let mut pending = Vec::with_capacity(futures.len()); + + for (idx, mut f) in futures.into_iter() { + match futures::poll!(&mut f) { + Poll::Ready(res) => { + output[idx] = Some(res); + } + Poll::Pending => { + pending.push((idx, f)); + } + } + } + + if !pending.is_empty() { + self.flush().await; + + // prevent hot-looping: + // It seems that in some cases the underlying loader is ready but the data is not available via the + // cache driver yet. This is likely due to the signalling system within the cache driver that prevents + // cancelation, but also allows side-loading and at the same time prevents that the same key is loaded + // multiple times. Tokio doesn't know that this method here is basically a wait loop. So we yield back + // to the tokio worker and to allow it to make some progress. Since flush+load take some time anyways, + // this yield here is not overall performance critical. 
+ tokio::task::yield_now().await; + } + + futures = pending; + } + + output + .into_iter() + .map(|o| o.expect("all futures finished")) + .collect() + } +} + +#[cfg(test)] +mod tests { + use tokio::sync::Barrier; + + use crate::{ + cache::{driver::CacheDriver, Cache}, + loader::test_util::TestLoader, + test_util::EnsurePendingExt, + }; + + use super::*; + + type TestLoaderT = Arc, Vec, Vec>>; + + #[tokio::test] + async fn test_flush_empty() { + let (inner, batch) = setup(); + batch.flush().await; + assert_eq!(inner.loaded(), vec![],); + } + + #[tokio::test] + async fn test_flush_manual() { + let (inner, batch) = setup(); + + let pending_barrier_1 = Arc::new(Barrier::new(2)); + let pending_barrier_1_captured = Arc::clone(&pending_barrier_1); + let batch_captured = Arc::clone(&batch); + let handle_1 = tokio::spawn(async move { + batch_captured + .load(1, true) + .ensure_pending(pending_barrier_1_captured) + .await + }); + pending_barrier_1.wait().await; + + let pending_barrier_2 = Arc::new(Barrier::new(2)); + let pending_barrier_2_captured = Arc::clone(&pending_barrier_2); + let batch_captured = Arc::clone(&batch); + let handle_2 = tokio::spawn(async move { + batch_captured + .load(2, false) + .ensure_pending(pending_barrier_2_captured) + .await + }); + pending_barrier_2.wait().await; + + inner.mock_next(vec![1, 2], vec![String::from("foo"), String::from("bar")]); + + batch.flush().await; + assert_eq!(inner.loaded(), vec![(vec![1, 2], vec![true, false])],); + + assert_eq!(handle_1.await.unwrap(), String::from("foo")); + assert_eq!(handle_2.await.unwrap(), String::from("bar")); + } + + /// Simulate the following scenario: + /// + /// 1. load `1`, flush it, inner load starts processing `[1]` + /// 2. load `2`, flush it, inner load starts processing `[2]` + /// 3. inner loader returns result for `[2]`, batch loader returns that result as well + /// 4. 
inner loader returns result for `[1]`, batch loader returns that result as well + #[tokio::test] + async fn test_concurrent_load() { + let (inner, batch) = setup(); + + let load_barrier_1 = inner.block_next(vec![1], vec![String::from("foo")]); + inner.mock_next(vec![2], vec![String::from("bar")]); + + // set up first load + let pending_barrier_1 = Arc::new(Barrier::new(2)); + let pending_barrier_1_captured = Arc::clone(&pending_barrier_1); + let batch_captured = Arc::clone(&batch); + let handle_1 = tokio::spawn(async move { + batch_captured + .load(1, true) + .ensure_pending(pending_barrier_1_captured) + .await + }); + pending_barrier_1.wait().await; + + // flush first load, this is blocked by the load barrier + let pending_barrier_2 = Arc::new(Barrier::new(2)); + let pending_barrier_2_captured = Arc::clone(&pending_barrier_2); + let batch_captured = Arc::clone(&batch); + let handle_2 = tokio::spawn(async move { + batch_captured + .flush() + .ensure_pending(pending_barrier_2_captured) + .await; + }); + pending_barrier_2.wait().await; + + // set up second load + let pending_barrier_3 = Arc::new(Barrier::new(2)); + let pending_barrier_3_captured = Arc::clone(&pending_barrier_3); + let batch_captured = Arc::clone(&batch); + let handle_3 = tokio::spawn(async move { + batch_captured + .load(2, false) + .ensure_pending(pending_barrier_3_captured) + .await + }); + pending_barrier_3.wait().await; + + // flush 2nd load and get result + batch.flush().await; + assert_eq!(handle_3.await.unwrap(), String::from("bar")); + + // flush 1st load and get result + load_barrier_1.wait().await; + handle_2.await.unwrap(); + assert_eq!(handle_1.await.unwrap(), String::from("foo")); + + assert_eq!( + inner.loaded(), + vec![(vec![1], vec![true]), (vec![2], vec![false])], + ); + } + + #[tokio::test] + async fn test_cancel_flush() { + let (inner, batch) = setup(); + + let load_barrier_1 = inner.block_next(vec![1], vec![String::from("foo")]); + + // set up load + let pending_barrier_1 = 
Arc::new(Barrier::new(2)); + let pending_barrier_1_captured = Arc::clone(&pending_barrier_1); + let batch_captured = Arc::clone(&batch); + let handle_1 = tokio::spawn(async move { + batch_captured + .load(1, true) + .ensure_pending(pending_barrier_1_captured) + .await + }); + pending_barrier_1.wait().await; + + // flush load, this is blocked by the load barrier + let pending_barrier_2 = Arc::new(Barrier::new(2)); + let pending_barrier_2_captured = Arc::clone(&pending_barrier_2); + let batch_captured = Arc::clone(&batch); + let handle_2 = tokio::spawn(async move { + batch_captured + .flush() + .ensure_pending(pending_barrier_2_captured) + .await; + }); + pending_barrier_2.wait().await; + + // abort flush + handle_2.abort(); + + // flush load and get result + load_barrier_1.wait().await; + assert_eq!(handle_1.await.unwrap(), String::from("foo")); + + assert_eq!(inner.loaded(), vec![(vec![1], vec![true])],); + } + + #[tokio::test] + async fn test_cancel_load_and_flush() { + let (inner, batch) = setup(); + + let load_barrier_1 = inner.block_next(vec![1], vec![String::from("foo")]); + + // set up load + let pending_barrier_1 = Arc::new(Barrier::new(2)); + let pending_barrier_1_captured = Arc::clone(&pending_barrier_1); + let batch_captured = Arc::clone(&batch); + let handle_1 = tokio::spawn(async move { + batch_captured + .load(1, true) + .ensure_pending(pending_barrier_1_captured) + .await + }); + pending_barrier_1.wait().await; + + // flush load, this is blocked by the load barrier + let pending_barrier_2 = Arc::new(Barrier::new(2)); + let pending_barrier_2_captured = Arc::clone(&pending_barrier_2); + let batch_captured = Arc::clone(&batch); + let handle_2 = tokio::spawn(async move { + batch_captured + .flush() + .ensure_pending(pending_barrier_2_captured) + .await; + }); + pending_barrier_2.wait().await; + + // abort load and flush + handle_1.abort(); + handle_2.abort(); + + // unblock + load_barrier_1.wait().await; + + // load was still driven to completion + 
assert_eq!(inner.loaded(), vec![(vec![1], vec![true])],); + } + + #[tokio::test] + async fn test_auto_flush_with_loader() { + let (inner, batch) = setup(); + + inner.mock_next(vec![1, 2], vec![String::from("foo"), String::from("bar")]); + + assert_eq!( + batch + .auto_flush(vec![batch.load(1, true), batch.load(2, false)]) + .await, + vec![String::from("foo"), String::from("bar")], + ); + + assert_eq!(inner.loaded(), vec![(vec![1, 2], vec![true, false])],); + } + + #[tokio::test] + async fn test_auto_flush_integration_with_cache_driver() { + let (inner, batch) = setup(); + let cache = CacheDriver::new( + Arc::clone(&batch), + HashMap::new(), + &metric::Registry::default(), + "test", + ); + + inner.mock_next(vec![1, 2], vec![String::from("foo"), String::from("bar")]); + inner.mock_next(vec![3], vec![String::from("baz")]); + + assert_eq!( + batch + .auto_flush(vec![cache.get(1, true), cache.get(2, false)]) + .await, + vec![String::from("foo"), String::from("bar")], + ); + assert_eq!( + batch + .auto_flush(vec![cache.get(2, true), cache.get(3, true)]) + .await, + vec![String::from("bar"), String::from("baz")], + ); + + assert_eq!( + inner.loaded(), + vec![(vec![1, 2], vec![true, false]), (vec![3], vec![true])], + ); + } + + fn setup() -> (TestLoaderT, Arc>) { + let inner = TestLoaderT::default(); + let batch = Arc::new(BatchLoader::new(Arc::clone(&inner))); + (inner, batch) + } +} diff --git a/cache_system/src/loader/metrics.rs b/cache_system/src/loader/metrics.rs new file mode 100644 index 0000000..72645b2 --- /dev/null +++ b/cache_system/src/loader/metrics.rs @@ -0,0 +1,247 @@ +//! Metrics for [`Loader`]. + +use std::sync::Arc; + +use async_trait::async_trait; +use iox_time::TimeProvider; +use metric::{DurationHistogram, U64Counter}; +use observability_deps::tracing::warn; +use parking_lot::Mutex; +use pdatastructs::filters::{bloomfilter::BloomFilter, Filter}; + +use super::Loader; + +/// Wraps a [`Loader`] and adds metrics. 
+pub struct MetricsLoader +where + L: Loader, +{ + inner: L, + time_provider: Arc, + metric_calls_new: U64Counter, + metric_calls_probably_reloaded: U64Counter, + metric_duration: DurationHistogram, + seen: Mutex>, +} + +impl MetricsLoader +where + L: Loader, +{ + /// Create new wrapper. + /// + /// # Testing + /// If `testing` is set, the "seen" metrics will NOT be processed correctly because the underlying data structure is + /// too expensive to create many times a second in an un-optimized debug build. + pub fn new( + inner: L, + name: &'static str, + time_provider: Arc, + metric_registry: &metric::Registry, + testing: bool, + ) -> Self { + let metric_calls = metric_registry.register_metric::( + "cache_load_function_calls", + "Count how often a cache loader was called.", + ); + let metric_calls_new = metric_calls.recorder(&[("name", name), ("status", "new")]); + let metric_calls_probably_reloaded = + metric_calls.recorder(&[("name", name), ("status", "probably_reloaded")]); + let metric_duration = metric_registry + .register_metric::( + "cache_load_function_duration", + "Time taken by cache load function calls", + ) + .recorder(&[("name", name)]); + + let seen = if testing { + BloomFilter::with_params(1, 1) + } else { + // Set up bloom filter for "probably reloaded" test: + // + // - input size: we expect 10M elements + // - reliability: probability of false positives should be <= 1% + // - CPU efficiency: number of hash functions should be <= 10 + // - RAM efficiency: size should be <= 15MB + // + // + // A bloom filter was chosen here because of the following properties: + // + // - memory bound: The storage size is bound even when the set of "probably reloaded" entries approaches + // infinite sizes. + // - memory efficiency: We do not need to store the actual keys. + // - infallible: Inserting new data into the filter never fails (in contrast to for example a CuckooFilter or + // QuotientFilter). 
+ // + // The fact that a filter can produce false positives (i.e. it classifies an actual new entry as "probably + // reloaded") is considered to be OK since the metric is more of an estimate and a guide for cache tuning. We + // might want to use a more efficient (i.e. more modern) filter design at one point though. + let seen = BloomFilter::with_properties(10_000_000, 1.0 / 100.0); + const BOUND_HASH_FUNCTIONS: usize = 10; + assert!( + seen.k() <= BOUND_HASH_FUNCTIONS, + "number of hash functions for bloom filter should be <= {} but is {}", + BOUND_HASH_FUNCTIONS, + seen.k(), + ); + const BOUND_SIZE_BYTES: usize = 15_000_000; + let size_bytes = (seen.m() + 7) / 8; + assert!( + size_bytes <= BOUND_SIZE_BYTES, + "size of bloom filter should be <= {BOUND_SIZE_BYTES} bytes but is {size_bytes} bytes", + ); + + seen + }; + + Self { + inner, + time_provider, + metric_calls_new, + metric_calls_probably_reloaded, + metric_duration, + seen: Mutex::new(seen), + } + } +} + +impl std::fmt::Debug for MetricsLoader +where + L: Loader, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MetricsLoader").finish_non_exhaustive() + } +} + +#[async_trait] +impl Loader for MetricsLoader +where + L: Loader, +{ + type K = L::K; + type V = L::V; + type Extra = L::Extra; + + async fn load(&self, k: Self::K, extra: Self::Extra) -> Self::V { + { + let mut seen_guard = self.seen.lock(); + + if seen_guard.insert(&k).expect("bloom filter cannot fail") { + &self.metric_calls_new + } else { + &self.metric_calls_probably_reloaded + } + .inc(1); + } + + let t_start = self.time_provider.now(); + let v = self.inner.load(k, extra).await; + let t_end = self.time_provider.now(); + + match t_end.checked_duration_since(t_start) { + Some(duration) => { + self.metric_duration.record(duration); + } + None => { + warn!("Clock went backwards, not recording loader duration"); + } + } + + v + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use 
iox_time::{MockProvider, Time}; + use metric::{Observation, RawReporter}; + + use crate::loader::FunctionLoader; + + use super::*; + + #[tokio::test] + async fn test_metrics() { + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_millis(0).unwrap())); + let metric_registry = Arc::new(metric::Registry::new()); + + let time_provider_captured = Arc::clone(&time_provider); + let d = Duration::from_secs(10); + let inner_loader = FunctionLoader::new(move |x: u64, _extra: ()| { + let time_provider_captured = Arc::clone(&time_provider_captured); + async move { + time_provider_captured.inc(d); + x.to_string() + } + }); + + let loader = MetricsLoader::new( + inner_loader, + "my_loader", + time_provider, + &metric_registry, + false, + ); + + let mut reporter = RawReporter::default(); + metric_registry.report(&mut reporter); + for status in ["new", "probably_reloaded"] { + assert_eq!( + reporter + .metric("cache_load_function_calls") + .unwrap() + .observation(&[("name", "my_loader"), ("status", status)]) + .unwrap(), + &Observation::U64Counter(0) + ); + } + if let Observation::DurationHistogram(hist) = reporter + .metric("cache_load_function_duration") + .unwrap() + .observation(&[("name", "my_loader")]) + .unwrap() + { + assert_eq!(hist.sample_count(), 0); + assert_eq!(hist.total, Duration::from_secs(0)); + } else { + panic!("Wrong observation type"); + } + + assert_eq!(loader.load(42, ()).await, String::from("42")); + assert_eq!(loader.load(42, ()).await, String::from("42")); + assert_eq!(loader.load(1337, ()).await, String::from("1337")); + + let mut reporter = RawReporter::default(); + metric_registry.report(&mut reporter); + assert_eq!( + reporter + .metric("cache_load_function_calls") + .unwrap() + .observation(&[("name", "my_loader"), ("status", "new")]) + .unwrap(), + &Observation::U64Counter(2) + ); + assert_eq!( + reporter + .metric("cache_load_function_calls") + .unwrap() + .observation(&[("name", "my_loader"), ("status", "probably_reloaded")]) + 
.unwrap(), + &Observation::U64Counter(1) + ); + if let Observation::DurationHistogram(hist) = reporter + .metric("cache_load_function_duration") + .unwrap() + .observation(&[("name", "my_loader")]) + .unwrap() + { + assert_eq!(hist.sample_count(), 3); + assert_eq!(hist.total, 3 * d); + } else { + panic!("Wrong observation type"); + } + } +} diff --git a/cache_system/src/loader/mod.rs b/cache_system/src/loader/mod.rs new file mode 100644 index 0000000..6c429a7 --- /dev/null +++ b/cache_system/src/loader/mod.rs @@ -0,0 +1,151 @@ +//! How to load new cache entries. +use async_trait::async_trait; +use std::{fmt::Debug, future::Future, hash::Hash, marker::PhantomData, sync::Arc}; + +pub mod batch; +pub mod metrics; + +#[cfg(test)] +pub(crate) mod test_util; + +/// Loader for missing [`Cache`](crate::cache::Cache) entries. +#[async_trait] +pub trait Loader: std::fmt::Debug + Send + Sync + 'static { + /// Cache key. + type K: Debug + Hash + Send + 'static; + + /// Extra data needed when loading a missing entry. Specify `()` if not needed. + type Extra: Debug + Send + 'static; + + /// Cache value. + type V: Debug + Send + 'static; + + /// Load value for given key, using the extra data if needed. + async fn load(&self, k: Self::K, extra: Self::Extra) -> Self::V; +} + +#[async_trait] +impl Loader for Box> +where + K: Debug + Hash + Send + 'static, + V: Debug + Send + 'static, + Extra: Debug + Send + 'static, +{ + type K = K; + type V = V; + type Extra = Extra; + + async fn load(&self, k: Self::K, extra: Self::Extra) -> Self::V { + self.as_ref().load(k, extra).await + } +} + +#[async_trait] +impl Loader for Arc +where + K: Debug + Hash + Send + 'static, + V: Debug + Send + 'static, + Extra: Debug + Send + 'static, + L: Loader, +{ + type K = K; + type V = V; + type Extra = Extra; + + async fn load(&self, k: Self::K, extra: Self::Extra) -> Self::V { + self.as_ref().load(k, extra).await + } +} + +/// Simple-to-use wrapper for async functions to act as a [`Loader`]. 
+/// +/// # Typing +/// Semantically this wrapper has only one degree of freedom: `T`, which is the async loader function. However until +/// [`fn_traits`] are stable, there is no way to extract the parameters and return value from a function via associated +/// types. So we need to add additional type parametes for the special `Fn(...) -> ...` handling. +/// +/// It is likely that `T` will be a closure, e.g.: +/// +/// ``` +/// use cache_system::loader::FunctionLoader; +/// +/// let my_loader = FunctionLoader::new(|k: u8, _extra: ()| async move { +/// format!("{k}") +/// }); +/// ``` +/// +/// There is no way to spell out the exact type of `my_loader` in the above example, because the closure has an +/// anonymous type. If you need the type signature of [`FunctionLoader`], you have to +/// [erase the type](https://en.wikipedia.org/wiki/Type_erasure) by putting the [`FunctionLoader`] it into a [`Box`], +/// e.g.: +/// +/// ``` +/// use cache_system::loader::{Loader, FunctionLoader}; +/// +/// let my_loader = FunctionLoader::new(|k: u8, _extra: ()| async move { +/// format!("{k}") +/// }); +/// let m_loader: Box> = Box::new(my_loader); +/// ``` +/// +/// +/// [`fn_traits`]: https://doc.rust-lang.org/beta/unstable-book/library-features/fn-traits.html +pub struct FunctionLoader +where + T: Fn(K, Extra) -> F + Send + Sync + 'static, + F: Future + Send + 'static, + K: Debug + Send + 'static, + F::Output: Debug + Send + 'static, + Extra: Debug + Send + 'static, +{ + loader: T, + _phantom: PhantomData (F, K, Extra) + Send + Sync + 'static>, +} + +impl FunctionLoader +where + T: Fn(K, Extra) -> F + Send + Sync + 'static, + F: Future + Send + 'static, + K: Debug + Send + 'static, + F::Output: Debug + Send + 'static, + Extra: Debug + Send + 'static, +{ + /// Create loader from function. 
+ pub fn new(loader: T) -> Self { + Self { + loader, + _phantom: PhantomData, + } + } +} + +impl std::fmt::Debug for FunctionLoader +where + T: Fn(K, Extra) -> F + Send + Sync + 'static, + F: Future + Send + 'static, + K: Debug + Send + 'static, + F::Output: Debug + Send + 'static, + Extra: Debug + Send + 'static, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("FunctionLoader").finish_non_exhaustive() + } +} + +#[async_trait] +impl Loader for FunctionLoader +where + T: Fn(K, Extra) -> F + Send + Sync + 'static, + F: Future + Send + 'static, + K: Debug + Hash + Send + 'static, + F::Output: Debug + Send + 'static, + Extra: Debug + Send + 'static, +{ + type K = K; + type V = F::Output; + type Extra = Extra; + + async fn load(&self, k: Self::K, extra: Self::Extra) -> Self::V { + (self.loader)(k, extra).await + } +} diff --git a/cache_system/src/loader/test_util.rs b/cache_system/src/loader/test_util.rs new file mode 100644 index 0000000..a35e708 --- /dev/null +++ b/cache_system/src/loader/test_util.rs @@ -0,0 +1,239 @@ +use std::{collections::HashMap, fmt::Debug, hash::Hash, sync::Arc}; + +use async_trait::async_trait; +use parking_lot::Mutex; +use tokio::sync::{Barrier, Notify}; + +use super::Loader; + +#[derive(Debug)] +enum TestLoaderResponse { + Answer { v: V, block: Option> }, + Panic, +} + +/// An easy-to-mock [`Loader`]. +#[derive(Debug, Default)] +pub struct TestLoader +where + K: Clone + Debug + Eq + Hash + Send + 'static, + Extra: Clone + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + responses: Mutex>>>, + blocked: Mutex>>, + loaded: Mutex>, +} + +impl TestLoader +where + K: Clone + Debug + Eq + Hash + Send + 'static, + Extra: Clone + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + /// Mock next value for given key-value pair. 
+ pub fn mock_next(&self, k: K, v: V) { + self.mock_inner(k, TestLoaderResponse::Answer { v, block: None }); + } + + /// Block on next load for given key-value pair. + /// + /// Return a barrier that can be used to unblock the load. + #[must_use] + pub fn block_next(&self, k: K, v: V) -> Arc { + let block = Arc::new(Barrier::new(2)); + self.mock_inner( + k, + TestLoaderResponse::Answer { + v, + block: Some(Arc::clone(&block)), + }, + ); + block + } + + /// Panic when loading value for `k`. + /// + /// If this is used together with [`block_global`](Self::block_global), the panic will occur AFTER + /// blocking. + pub fn panic_next(&self, k: K) { + self.mock_inner(k, TestLoaderResponse::Panic); + } + + fn mock_inner(&self, k: K, response: TestLoaderResponse) { + let mut responses = self.responses.lock(); + responses.entry(k).or_default().push(response); + } + + /// Block all [`load`](Self::load) requests until [`unblock`](Self::unblock) is called. + /// + /// If this is used together with [`panic_once`](Self::panic_once), the panic will occur + /// AFTER blocking. + pub fn block_global(&self) { + let mut blocked = self.blocked.lock(); + assert!(blocked.is_none()); + *blocked = Some(Arc::new(Notify::new())); + } + + /// Unblock all requests. + /// + /// Returns number of requests that were blocked. + pub fn unblock_global(&self) -> usize { + let handle = self.blocked.lock().take().unwrap(); + let blocked_count = Arc::strong_count(&handle) - 1; + handle.notify_waiters(); + blocked_count + } + + /// List all keys that were loaded. + /// + /// Contains duplicates if keys were loaded multiple times. + pub fn loaded(&self) -> Vec<(K, Extra)> { + self.loaded.lock().clone() + } +} + +impl Drop for TestLoader +where + K: Clone + Debug + Eq + Hash + Send + 'static, + Extra: Clone + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + fn drop(&mut self) { + // prevent double-panic (i.e. 
aborts) + if !std::thread::panicking() { + for entries in self.responses.lock().values() { + assert!(entries.is_empty(), "mocked response left"); + } + } + } +} + +#[async_trait] +impl Loader for TestLoader +where + K: Clone + Debug + Eq + Hash + Send + 'static, + Extra: Clone + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, +{ + type K = K; + type Extra = Extra; + type V = V; + + async fn load(&self, k: Self::K, extra: Self::Extra) -> Self::V { + self.loaded.lock().push((k.clone(), extra)); + + // need to capture the cloned notify handle, otherwise the lock guard leaks into the + // generator + let maybe_block = self.blocked.lock().clone(); + if let Some(block) = maybe_block { + block.notified().await; + } + + let response = { + let mut guard = self.responses.lock(); + let entries = guard.get_mut(&k).expect("entry not mocked"); + + assert!(!entries.is_empty(), "no mocked response left"); + + entries.remove(0) + }; + + match response { + TestLoaderResponse::Answer { v, block } => { + if let Some(block) = block { + block.wait().await; + } + + v + } + TestLoaderResponse::Panic => { + panic!("test") + } + } + } +} + +#[cfg(test)] +mod tests { + use futures::FutureExt; + + use super::*; + + #[tokio::test] + #[should_panic(expected = "entry not mocked")] + async fn test_loader_panic_entry_unknown() { + let loader = TestLoader::::default(); + loader.load(1, ()).await; + } + + #[tokio::test] + #[should_panic(expected = "no mocked response left")] + async fn test_loader_panic_no_mocked_reponse_left() { + let loader = TestLoader::default(); + loader.mock_next(1, String::from("foo")); + loader.load(1, ()).await; + loader.load(1, ()).await; + } + + #[test] + #[should_panic(expected = "mocked response left")] + fn test_loader_panic_requests_left() { + let loader = TestLoader::::default(); + loader.mock_next(1, String::from("foo")); + } + + #[test] + #[should_panic(expected = "panic-by-choice")] + fn test_loader_no_double_panic() { + let loader = 
TestLoader::::default(); + loader.mock_next(1, String::from("foo")); + panic!("panic-by-choice"); + } + + #[tokio::test] + async fn test_loader_nonblocking_mock() { + let loader = TestLoader::default(); + + loader.mock_next(1, String::from("foo")); + loader.mock_next(1, String::from("bar")); + loader.mock_next(2, String::from("baz")); + + assert_eq!(loader.load(1, ()).await, String::from("foo")); + assert_eq!(loader.load(2, ()).await, String::from("baz")); + assert_eq!(loader.load(1, ()).await, String::from("bar")); + } + + #[tokio::test] + async fn test_loader_blocking_mock() { + let loader = Arc::new(TestLoader::default()); + + let loader_barrier = loader.block_next(1, String::from("foo")); + loader.mock_next(2, String::from("bar")); + + let is_blocked_barrier = Arc::new(Barrier::new(2)); + + let loader_captured = Arc::clone(&loader); + let is_blocked_barrier_captured = Arc::clone(&is_blocked_barrier); + let handle = tokio::task::spawn(async move { + let mut fut_load = loader_captured.load(1, ()).fuse(); + + futures::select_biased! { + _ = fut_load => { + panic!("should not finish"); + } + _ = is_blocked_barrier_captured.wait().fuse() => {} + } + fut_load.await + }); + + is_blocked_barrier.wait().await; + + // can still load other entries + assert_eq!(loader.load(2, ()).await, String::from("bar")); + + // unblock load + loader_barrier.wait().await; + assert_eq!(handle.await.unwrap(), String::from("foo")); + } +} diff --git a/cache_system/src/resource_consumption.rs b/cache_system/src/resource_consumption.rs new file mode 100644 index 0000000..c2d32ce --- /dev/null +++ b/cache_system/src/resource_consumption.rs @@ -0,0 +1,195 @@ +//! Reasoning about resource consumption of cached data. +use std::{ + fmt::Debug, + marker::PhantomData, + ops::{Add, Sub}, +}; + +/// Strongly-typed resource consumption. +/// +/// Can be used to represent in-RAM memory as well as on-disc memory. 
+pub trait Resource: + Add + + Copy + + Debug + + Into + + Ord + + PartialOrd + + Send + + Sync + + Sub + + 'static +{ + /// Create resource consumption of zero. + fn zero() -> Self; + + /// Unit name. + /// + /// This must be a single lowercase word. + fn unit() -> &'static str; +} + +/// An estimator of [`Resource`] consumption for a given key-value pair. +pub trait ResourceEstimator: Debug + Send + Sync + 'static { + /// Cache key. + type K; + + /// Cached value. + type V; + + /// Size that can be estimated. + type S: Resource; + + /// Estimate size of given key-value pair. + fn consumption(&self, k: &Self::K, v: &Self::V) -> Self::S; +} + +/// A simple function-based [`ResourceEstimator]. +/// +/// # Typing +/// Semantically this wrapper has only one degree of freedom: `F`, which is the estimator function. However until +/// [`fn_traits`] are stable, there is no way to extract the parameters and return value from a function via associated +/// types. So we need to add additional type parametes for the special `Fn(...) -> ...` handling. +/// +/// It is likely that `F` will be a closure, e.g.: +/// +/// ``` +/// use cache_system::resource_consumption::{ +/// FunctionEstimator, +/// test_util::TestSize, +/// }; +/// +/// let my_estimator = FunctionEstimator::new(|_k: &u8, v: &String| -> TestSize { +/// TestSize(std::mem::size_of::<(u8, String)>() + v.capacity()) +/// }); +/// ``` +/// +/// There is no way to spell out the exact type of `my_estimator` in the above example, because the closure has an +/// anonymous type. 
If you need the type signature of [`FunctionEstimator`], you have to +/// [erase the type](https://en.wikipedia.org/wiki/Type_erasure) by putting the [`FunctionEstimator`] it into a [`Box`], +/// e.g.: +/// +/// ``` +/// use cache_system::resource_consumption::{ +/// FunctionEstimator, +/// ResourceEstimator, +/// test_util::TestSize, +/// }; +/// +/// let my_estimator = FunctionEstimator::new(|_k: &u8, v: &String| -> TestSize { +/// TestSize(std::mem::size_of::<(u8, String)>() + v.capacity()) +/// }); +/// let my_estimator: Box> = Box::new(my_estimator); +/// ``` +/// +/// +/// [`fn_traits`]: https://doc.rust-lang.org/beta/unstable-book/library-features/fn-traits.html +pub struct FunctionEstimator +where + F: Fn(&K, &V) -> S + Send + Sync + 'static, + K: 'static, + V: 'static, + S: Resource, +{ + estimator: F, + _phantom: PhantomData (K, V, S) + Send + Sync + 'static>, +} + +impl FunctionEstimator +where + F: Fn(&K, &V) -> S + Send + Sync + 'static, + K: 'static, + V: 'static, + S: Resource, +{ + /// Create new resource estimator from given function. + pub fn new(f: F) -> Self { + Self { + estimator: f, + _phantom: PhantomData, + } + } +} + +impl std::fmt::Debug for FunctionEstimator +where + F: Fn(&K, &V) -> S + Send + Sync + 'static, + K: 'static, + V: 'static, + S: Resource, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("FunctionEstimator").finish_non_exhaustive() + } +} + +impl ResourceEstimator for FunctionEstimator +where + F: Fn(&K, &V) -> S + Send + Sync + 'static, + K: 'static, + V: 'static, + S: Resource, +{ + type K = K; + type V = V; + type S = S; + + fn consumption(&self, k: &Self::K, v: &Self::V) -> Self::S { + (self.estimator)(k, v) + } +} + +pub mod test_util { + //! Helpers to test resource consumption-based algorithms. + use super::*; + + /// Simple resource type for testing. 
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] + pub struct TestSize(pub usize); + + impl Resource for TestSize { + fn zero() -> Self { + Self(0) + } + + fn unit() -> &'static str { + "bytes" + } + } + + impl From for u64 { + fn from(s: TestSize) -> Self { + s.0 as Self + } + } + + impl Add for TestSize { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0.checked_add(rhs.0).expect("overflow")) + } + } + + impl Sub for TestSize { + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Self(self.0.checked_sub(rhs.0).expect("underflow")) + } + } +} + +#[cfg(test)] +mod tests { + use crate::resource_consumption::test_util::TestSize; + + use super::*; + + #[test] + fn test_function_estimator() { + let estimator = + FunctionEstimator::new(|k: &u8, v: &u16| TestSize((*k as usize) * 10 + (*v as usize))); + assert_eq!(estimator.consumption(&3, &2), TestSize(32)); + } +} diff --git a/cache_system/src/test_util.rs b/cache_system/src/test_util.rs new file mode 100644 index 0000000..959cc68 --- /dev/null +++ b/cache_system/src/test_util.rs @@ -0,0 +1,62 @@ +use std::{future::Future, sync::Arc, time::Duration}; + +use async_trait::async_trait; +use futures::FutureExt; +use tokio::{sync::Barrier, task::JoinHandle}; + +#[async_trait] +pub trait EnsurePendingExt { + type Out; + + /// Ensure that the future is pending. In the pending case, try to pass the given barrier. Afterwards await the future again. + /// + /// This is helpful to ensure a future is in a pending state before continuing with the test setup. + async fn ensure_pending(self, barrier: Arc) -> Self::Out; +} + +#[async_trait] +impl EnsurePendingExt for F +where + F: Future + Send + Unpin, +{ + type Out = F::Output; + + async fn ensure_pending(self, barrier: Arc) -> Self::Out { + let mut fut = self.fuse(); + futures::select_biased! 
{ + _ = fut => panic!("fut should be pending"), + _ = barrier.wait().fuse() => (), + } + + fut.await + } +} + +#[async_trait] +pub trait AbortAndWaitExt { + /// Abort handle and wait for completion. + /// + /// Note that this is NOT just a "wait with timeout or panic". This extension is specific to [`JoinHandle`] and will: + /// + /// 1. Call [`JoinHandle::abort`]. + /// 2. Await the [`JoinHandle`] with a timeout (or panic if the timeout is reached). + /// 3. Check that the handle returned a [`JoinError`] that signals that the tracked task was indeed cancelled and + /// didn't exit otherwise (either by finishing or by panicking). + async fn abort_and_wait(self); +} + +#[async_trait] +impl AbortAndWaitExt for JoinHandle +where + T: std::fmt::Debug + Send, +{ + async fn abort_and_wait(mut self) { + self.abort(); + + let join_err = tokio::time::timeout(Duration::from_secs(1), self) + .await + .expect("no timeout") + .expect_err("handle was aborted and therefore MUST fail"); + assert!(join_err.is_cancelled()); + } +} diff --git a/catalog_cache/Cargo.toml b/catalog_cache/Cargo.toml new file mode 100644 index 0000000..cdb79c5 --- /dev/null +++ b/catalog_cache/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "catalog_cache" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +bytes = "1.5" +dashmap = "5.5" +futures = "0.3" +hyper = "0.14" +url = "2.5" +reqwest = { version = "0.11", default-features = false } +snafu = "0.8" +tokio = { version = "1.35", default-features = false, features = ["macros", "rt"] } +tokio-util = "0.7" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] diff --git a/catalog_cache/src/api/client.rs b/catalog_cache/src/api/client.rs new file mode 100644 index 0000000..94e9bf9 --- /dev/null +++ b/catalog_cache/src/api/client.rs @@ -0,0 +1,176 @@ +//! 
Client for the cache HTTP API + +use crate::api::list::{ListDecoder, ListEntry, MAX_VALUE_SIZE}; +use crate::api::{RequestPath, GENERATION}; +use crate::{CacheKey, CacheValue}; +use bytes::{Buf, Bytes}; +use futures::prelude::*; +use futures::stream::BoxStream; +use reqwest::{Client, Response, StatusCode, Url}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::time::Duration; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Creating client: {source}"))] + Client { source: reqwest::Error }, + + #[snafu(display("Put Reqwest error: {source}"))] + Put { source: reqwest::Error }, + + #[snafu(display("Get Reqwest error: {source}"))] + Get { source: reqwest::Error }, + + #[snafu(display("List Reqwest error: {source}"))] + List { source: reqwest::Error }, + + #[snafu(display("Health Reqwest error: {source}"))] + Health { source: reqwest::Error }, + + #[snafu(display("Missing generation header"))] + MissingGeneration, + + #[snafu(display("Invalid generation value"))] + InvalidGeneration, + + #[snafu(display("Error decoding list stream: {source}"), context(false))] + ListStream { source: crate::api::list::Error }, +} + +/// Result type for [`CatalogCacheClient`] +pub type Result = std::result::Result; + +/// The type returned by [`CatalogCacheClient::list`] +pub type ListStream = BoxStream<'static, Result>; + +const RESOURCE_REQUEST_TIMEOUT: Duration = Duration::from_secs(1); + +/// We use a longer timeout for list request as they may transfer a non-trivial amount of data +const LIST_REQUEST_TIMEOUT: Duration = Duration::from_secs(20); + +/// A client for accessing a remote catalog cache +#[derive(Debug)] +pub struct CatalogCacheClient { + client: Client, + endpoint: Url, +} + +impl CatalogCacheClient { + /// Create a new [`CatalogCacheClient`] with the given remote endpoint + pub fn try_new(endpoint: Url) -> Result { + let client = Client::builder() + .connect_timeout(Duration::from_secs(2)) + .build() + .context(ClientSnafu)?; + + 
Ok(Self { endpoint, client }) + } + + /// Retrieve the given value from the remote cache, if present + pub async fn get(&self, key: CacheKey) -> Result> { + let url = format!("{}{}", self.endpoint, RequestPath::Resource(key)); + let timeout = RESOURCE_REQUEST_TIMEOUT; + let req = self.client.get(url).timeout(timeout); + let resp = req.send().await.context(GetSnafu)?; + + if resp.status() == StatusCode::NOT_FOUND { + return Ok(None); + } + let resp = resp.error_for_status().context(GetSnafu)?; + + let generation = resp + .headers() + .get(&GENERATION) + .context(MissingGenerationSnafu)?; + + let generation = generation + .to_str() + .ok() + .and_then(|v| v.parse().ok()) + .context(InvalidGenerationSnafu)?; + + let data = resp.bytes().await.context(GetSnafu)?; + + Ok(Some(CacheValue::new(data, generation))) + } + + /// Upsert the given key-value pair to the remote cache + /// + /// Returns false if the value had a generation less than or equal to + /// an existing value + pub async fn put(&self, key: CacheKey, value: &CacheValue) -> Result { + let url = format!("{}{}", self.endpoint, RequestPath::Resource(key)); + + let response = self + .client + .put(url) + .timeout(RESOURCE_REQUEST_TIMEOUT) + .header(&GENERATION, value.generation) + .body(value.data.clone()) + .send() + .await + .context(PutSnafu)? + .error_for_status() + .context(PutSnafu)?; + + Ok(matches!(response.status(), StatusCode::OK)) + } + + /// List the contents of the remote cache + /// + /// Values larger than `max_value_size` will not be returned inline, with only the key + /// and generation returned instead. 
Defaults to [`MAX_VALUE_SIZE`] + pub fn list(&self, max_value_size: Option) -> ListStream { + let size = max_value_size.unwrap_or(MAX_VALUE_SIZE); + let url = format!("{}{}?size={size}", self.endpoint, RequestPath::List); + let fut = self.client.get(url).timeout(LIST_REQUEST_TIMEOUT).send(); + + futures::stream::once(fut.map_err(|source| Error::List { source })) + .and_then(move |response| futures::future::ready(list_stream(response, size))) + .try_flatten() + .boxed() + } +} + +struct ListStreamState { + response: Response, + current: Bytes, + decoder: ListDecoder, +} + +impl ListStreamState { + fn new(response: Response, max_value_size: usize) -> Self { + Self { + response, + current: Default::default(), + decoder: ListDecoder::new().with_max_value_size(max_value_size), + } + } +} + +fn list_stream( + response: Response, + max_value_size: usize, +) -> Result>> { + let resp = response.error_for_status().context(ListSnafu)?; + let state = ListStreamState::new(resp, max_value_size); + Ok(stream::try_unfold(state, |mut state| async move { + loop { + if state.current.is_empty() { + match state.response.chunk().await.context(ListSnafu)? { + Some(new) => state.current = new, + None => break, + } + } + + let to_read = state.current.len(); + let read = state.decoder.decode(&state.current)?; + state.current.advance(read); + if read != to_read { + break; + } + } + Ok(state.decoder.flush()?.map(|entry| (entry, state))) + })) +} diff --git a/catalog_cache/src/api/list.rs b/catalog_cache/src/api/list.rs new file mode 100644 index 0000000..155f794 --- /dev/null +++ b/catalog_cache/src/api/list.rs @@ -0,0 +1,467 @@ +//! The encoding mechanism for list streams +//! +//! This is capable of streaming both keys and values, this saves round-trips when hydrating +//! 
a cache from a remote, and avoids creating a flood of HTTP GET requests + +use bytes::Bytes; +use snafu::{ensure, Snafu}; + +use crate::{CacheKey, CacheValue}; + +/// Error type for list streams +#[derive(Debug, Snafu)] +#[allow(missing_copy_implementations, missing_docs)] +pub enum Error { + #[snafu(display("Unexpected EOF whilst decoding list stream"))] + UnexpectedEOF, + + #[snafu(display("List value of {size} bytes too large"))] + ValueTooLarge { size: usize }, +} + +/// Result type for list streams +pub type Result = std::result::Result; + +/// The size at which to flush [`Bytes`] from [`ListEncoder`] +pub const FLUSH_SIZE: usize = 1024 * 1024; // Flush in 1MB blocks + +/// The maximum value size to send in a [`ListEntry`] +/// +/// This primarily exists as a self-protection limit to prevent large or corrupted streams +/// from swamping the client, but also mitigates Head-Of-Line blocking resulting from +/// large cache values +pub const MAX_VALUE_SIZE: usize = 1024 * 10; + +/// Encodes [`ListEntry`] as an iterator of [`Bytes`] +/// +/// Each [`ListEntry`] is encoded as a `ListHeader`, followed by the value data +#[derive(Debug)] +pub struct ListEncoder { + /// The current offset into entries + offset: usize, + /// The ListEntry to encode + entries: Vec, + /// The flush size, made configurable for testing + flush_size: usize, + /// The maximum value size to write + max_value_size: usize, +} + +impl ListEncoder { + /// Create a new [`ListEncoder`] from the provided [`ListEntry`] + pub fn new(entries: Vec) -> Self { + Self { + entries, + offset: 0, + flush_size: FLUSH_SIZE, + max_value_size: MAX_VALUE_SIZE, + } + } + + /// Override the maximum value size to write + pub fn with_max_value_size(mut self, size: usize) -> Self { + self.max_value_size = size; + self + } +} + +impl Iterator for ListEncoder { + type Item = Bytes; + + fn next(&mut self) -> Option { + if self.offset == self.entries.len() { + return None; + } + + let mut cap = 0; + let mut end_offset = 
self.offset; + while end_offset < self.entries.len() && cap < self.flush_size { + match &self.entries[end_offset].data { + Some(d) if d.len() <= self.max_value_size => cap += ListHeader::SIZE + d.len(), + _ => cap += ListHeader::SIZE, + }; + end_offset += 1; + } + + let mut buf = Vec::with_capacity(cap); + for entry in self.entries.iter().take(end_offset).skip(self.offset) { + match &entry.data { + Some(d) if d.len() <= self.max_value_size => { + buf.extend_from_slice(&entry.header(false).encode()); + buf.extend_from_slice(d) + } + _ => buf.extend_from_slice(&entry.header(true).encode()), + } + } + self.offset = end_offset; + Some(buf.into()) + } +} + +#[allow(non_snake_case)] +mod Flags { + /// The value is not included in this response + /// + /// [`ListEncoder`](super::ListEncoder) only sends inline values for values smaller than a + /// configured threshold + pub(crate) const HEAD: u8 = 1; +} + +/// The header encoded in a list stream +#[derive(Debug)] +struct ListHeader { + /// The size of the value + size: u32, + /// Reserved for future usage + reserved: u16, + /// A bitmask of [`Flags`] + flags: u8, + /// The variant of [`CacheKey`] + variant: u8, + /// The generation of this value + generation: u64, + /// The key contents of [`CacheKey`] + key: u128, +} + +impl ListHeader { + /// The encoded size of [`ListHeader`] + const SIZE: usize = 32; + + /// Encodes [`ListHeader`] to an array + fn encode(&self) -> [u8; Self::SIZE] { + let mut out = [0; Self::SIZE]; + out[..4].copy_from_slice(&self.size.to_le_bytes()); + out[4..6].copy_from_slice(&self.reserved.to_le_bytes()); + out[6] = self.flags; + out[7] = self.variant; + out[8..16].copy_from_slice(&self.generation.to_le_bytes()); + out[16..32].copy_from_slice(&self.key.to_le_bytes()); + out + } + + /// Decodes [`ListHeader`] from an array + fn decode(buf: [u8; Self::SIZE]) -> Self { + Self { + size: u32::from_le_bytes(buf[..4].try_into().unwrap()), + reserved: u16::from_le_bytes(buf[4..6].try_into().unwrap()), + 
flags: buf[6], + variant: buf[7], + generation: u64::from_le_bytes(buf[8..16].try_into().unwrap()), + key: u128::from_le_bytes(buf[16..32].try_into().unwrap()), + } + } +} + +/// The state for [`ListDecoder`] +#[derive(Debug)] +enum DecoderState { + /// Decoding a header, contains the decoded data and the current offset + Header([u8; ListHeader::SIZE], usize), + /// Decoding value data for the given [`ListHeader`] + Body(ListHeader, Vec), +} + +impl Default for DecoderState { + fn default() -> Self { + Self::Header([0; ListHeader::SIZE], 0) + } +} + +/// Decodes [`ListEntry`] from a stream of bytes +#[derive(Debug)] +pub struct ListDecoder { + state: DecoderState, + max_size: usize, +} + +impl Default for ListDecoder { + fn default() -> Self { + Self { + state: DecoderState::default(), + max_size: MAX_VALUE_SIZE, + } + } +} + +impl ListDecoder { + /// Create a new [`ListDecoder`] + pub fn new() -> Self { + Self::default() + } + + /// Overrides the maximum value to deserialize + /// + /// Values larger than this will result in an error + /// Defaults to [`MAX_VALUE_SIZE`] + pub fn with_max_value_size(mut self, size: usize) -> Self { + self.max_size = size; + self + } + + /// Decode an entry from `buf`, returning the number of bytes consumed + /// + /// This is meant to be used in combination with [`Self::flush`] + pub fn decode(&mut self, mut buf: &[u8]) -> Result { + let initial = buf.len(); + while !buf.is_empty() { + match &mut self.state { + DecoderState::Header(header, offset) => { + let to_read = buf.len().min(ListHeader::SIZE - *offset); + header[*offset..*offset + to_read].copy_from_slice(&buf[..to_read]); + *offset += to_read; + buf = &buf[to_read..]; + + if *offset == ListHeader::SIZE { + let header = ListHeader::decode(*header); + let size = header.size as _; + ensure!(size <= self.max_size, ValueTooLargeSnafu { size }); + self.state = DecoderState::Body(header, Vec::with_capacity(size)) + } + } + DecoderState::Body(header, value) => { + let to_read = 
buf.len().min(header.size as usize - value.len()); + if to_read == 0 { + break; + } + value.extend_from_slice(&buf[..to_read]); + buf = &buf[to_read..]; + } + } + } + Ok(initial - buf.len()) + } + + /// Flush the contents of this [`ListDecoder`] + /// + /// Returns `Ok(Some(entry))` if a record is fully decoded + /// Returns `Ok(None)` if no in-progress record + /// Otherwise returns an error + pub fn flush(&mut self) -> Result> { + match std::mem::take(&mut self.state) { + DecoderState::Body(header, value) if value.len() == header.size as usize => { + Ok(Some(ListEntry { + variant: header.variant, + key: header.key, + generation: header.generation, + data: ((header.flags & Flags::HEAD) == 0).then(|| value.into()), + })) + } + DecoderState::Header(_, 0) => Ok(None), + _ => Err(Error::UnexpectedEOF), + } + } +} + +/// A key value pair encoded as part of a list +/// +/// Unlike [`CacheKey`] and [`CacheValue`] this allows: +/// +/// * Non-fatal handling of unknown key variants +/// * The option to not include the value data, e.g. 
if too large +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct ListEntry { + variant: u8, + generation: u64, + key: u128, + data: Option, +} + +impl ListEntry { + /// Create a new [`ListEntry`] from the provided key and value + pub fn new(key: CacheKey, value: CacheValue) -> Self { + let (variant, key) = match key { + CacheKey::Namespace(v) => (b'n', v as _), + CacheKey::Table(v) => (b't', v as _), + CacheKey::Partition(v) => (b'p', v as _), + }; + + Self { + key, + variant, + generation: value.generation, + data: Some(value.data), + } + } + + /// The key if it matches a known variant of [`CacheKey`] + /// + /// Returns `None` otherwise + pub fn key(&self) -> Option { + match self.variant { + b't' => Some(CacheKey::Table(self.key as _)), + b'n' => Some(CacheKey::Namespace(self.key as _)), + b'p' => Some(CacheKey::Partition(self.key as _)), + _ => None, + } + } + + /// The generation of this entry + pub fn generation(&self) -> u64 { + self.generation + } + + /// Returns the value data if present + pub fn value(&self) -> Option<&Bytes> { + self.data.as_ref() + } + + /// Returns the [`ListHeader`] for a given [`ListEntry`] + fn header(&self, head: bool) -> ListHeader { + let generation = self.generation; + let (flags, size) = match (head, &self.data) { + (false, Some(data)) => (0, data.len() as u32), + _ => (Flags::HEAD, 0), + }; + + ListHeader { + size, + flags, + variant: self.variant, + key: self.key, + generation, + reserved: 0, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Buf; + use std::io::BufRead; + + fn decode_entries(mut r: R) -> Result> { + let mut decoder = ListDecoder::default(); + let iter = std::iter::from_fn(move || { + loop { + let buf = r.fill_buf().unwrap(); + if buf.is_empty() { + break; + } + let to_read = buf.len(); + let read = decoder.decode(buf).unwrap(); + r.consume(read); + if read != to_read { + break; + } + } + decoder.flush().transpose() + }); + iter.collect() + } + + #[test] + fn test_roundtrip() { + let 
expected = vec![ + ListEntry::new(CacheKey::Namespace(2), CacheValue::new("test".into(), 32)), + ListEntry::new(CacheKey::Namespace(6), CacheValue::new("3".into(), 4)), + ListEntry { + variant: 0, + key: u128::MAX, + generation: u64::MAX, + data: Some("unknown".into()), + }, + ListEntry::new(CacheKey::Table(6), CacheValue::new("3".into(), 23)), + ListEntry { + variant: b'p', + key: 45, + generation: 23, + data: None, + }, + ListEntry::new( + CacheKey::Partition(3), + CacheValue::new("bananas".into(), 23), + ), + ]; + + let encoded: Vec<_> = ListEncoder::new(expected.clone()).collect(); + assert_eq!(encoded.len(), 1); // Expect entries to be encoded in single flush + + for buf_size in [3, 5, 12] { + let reader = std::io::BufReader::with_capacity(buf_size, encoded[0].clone().reader()); + let entries = decode_entries(reader).unwrap(); + assert_eq!(entries, expected); + + // Invalid key should not be fatal + assert_eq!(entries[2].key(), None); + // Head response should not be fatal + assert_eq!(entries[4].value(), None); + } + } + + #[test] + fn test_empty() { + let data: Vec<_> = ListEncoder::new(vec![]).collect(); + assert_eq!(data.len(), 0); + + let entries = decode_entries(std::io::Cursor::new([0_u8; 0])).unwrap(); + assert_eq!(entries.len(), 0); + } + + #[test] + fn test_flush_size() { + let data = Bytes::from(vec![0; 128]); + let entries = (0..1024) + .map(|x| ListEntry::new(CacheKey::Namespace(x), CacheValue::new(data.clone(), 0))) + .collect(); + + let mut encoder = ListEncoder::new(entries); + encoder.flush_size = 1024; // Lower limit for test + + let mut remaining = 1024; + for block in encoder { + let expected = remaining.min(7); + assert_eq!(block.len(), (data.len() + ListHeader::SIZE) * expected); + let decoded = decode_entries(block.reader()).unwrap(); + assert_eq!(decoded.len(), expected); + remaining -= expected; + } + } + + #[test] + fn test_size_limit() { + let entries = vec![ + ListEntry::new( + CacheKey::Namespace(0), + CacheValue::new(vec![0; 
128].into(), 0), + ), + ListEntry::new( + CacheKey::Namespace(1), + CacheValue::new(vec![0; 129].into(), 0), + ), + ListEntry::new( + CacheKey::Namespace(2), + CacheValue::new(vec![0; 128].into(), 0), + ), + ]; + + let mut encoder = ListEncoder::new(entries); + encoder.max_value_size = 128; // Artificially lower limit for test + + let encoded: Vec<_> = encoder.collect(); + assert_eq!(encoded.len(), 1); + + let decoded = decode_entries(encoded[0].clone().reader()).unwrap(); + assert_eq!(decoded[0].value().unwrap().len(), 128); + assert_eq!(decoded[1].value(), None); // Should omit value that is too large + assert_eq!(decoded[2].value().unwrap().len(), 128); + + let mut decoder = ListDecoder::new(); + decoder.max_size = 12; + let err = decoder.decode(&encoded[0]).unwrap_err().to_string(); + assert_eq!(err, "List value of 128 bytes too large"); + + let mut decoder = ListDecoder::new(); + decoder.max_size = 128; + + let consumed = decoder.decode(&encoded[0]).unwrap(); + let r = decoder.flush().unwrap().unwrap(); + assert_eq!(r.value().unwrap().len(), 128); + + // Next record skipped by encoder as too large + decoder.decode(&encoded[0][consumed..]).unwrap(); + let r = decoder.flush().unwrap().unwrap(); + assert_eq!(r.value(), None); + } +} diff --git a/catalog_cache/src/api/mod.rs b/catalog_cache/src/api/mod.rs new file mode 100644 index 0000000..66d4042 --- /dev/null +++ b/catalog_cache/src/api/mod.rs @@ -0,0 +1,159 @@ +//! 
The remote API for the catalog cache + +use crate::CacheKey; +use hyper::http::HeaderName; + +pub mod client; + +pub mod quorum; + +pub mod server; + +pub mod list; + +/// The header used to encode the generation in a get response +static GENERATION: HeaderName = HeaderName::from_static("x-influx-generation"); + +/// Defines the mapping to HTTP paths for given request types +#[derive(Debug, Eq, PartialEq)] +enum RequestPath { + /// A request addressing a resource identified by [`CacheKey`] + Resource(CacheKey), + /// A list request + List, +} + +impl RequestPath { + fn parse(s: &str) -> Option { + let s = s.strip_prefix('/').unwrap_or(s); + if s == "v1/" { + return Some(Self::List); + } + + let (prefix, value) = s.rsplit_once('/')?; + let value = u64::from_str_radix(value, 16).ok()?; + match prefix { + "v1/n" => Some(Self::Resource(CacheKey::Namespace(value as i64))), + "v1/t" => Some(Self::Resource(CacheKey::Table(value as i64))), + "v1/p" => Some(Self::Resource(CacheKey::Partition(value as i64))), + _ => None, + } + } +} + +impl std::fmt::Display for RequestPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::List => write!(f, "v1/"), + Self::Resource(CacheKey::Namespace(v)) => write!(f, "v1/n/{v:016x}"), + Self::Resource(CacheKey::Table(v)) => write!(f, "v1/t/{v:016x}"), + Self::Resource(CacheKey::Partition(v)) => write!(f, "v1/p/{v:016x}"), + } + } +} + +#[cfg(test)] +mod tests { + use crate::api::list::ListEntry; + use crate::api::server::test_util::TestCacheServer; + use crate::api::RequestPath; + use crate::{CacheKey, CacheValue}; + use futures::TryStreamExt; + use std::collections::HashSet; + + #[test] + fn test_request_path() { + let paths = [ + RequestPath::List, + RequestPath::Resource(CacheKey::Partition(12)), + RequestPath::Resource(CacheKey::Partition(i64::MAX)), + RequestPath::Resource(CacheKey::Partition(i64::MIN)), + RequestPath::Resource(CacheKey::Namespace(12)), + 
RequestPath::Resource(CacheKey::Namespace(i64::MAX)), + RequestPath::Resource(CacheKey::Namespace(i64::MIN)), + RequestPath::Resource(CacheKey::Table(12)), + RequestPath::Resource(CacheKey::Table(i64::MAX)), + RequestPath::Resource(CacheKey::Table(i64::MIN)), + ]; + + let mut set = HashSet::with_capacity(paths.len()); + for path in paths { + let s = path.to_string(); + let back = RequestPath::parse(&s).unwrap(); + assert_eq!(back, path); + assert!(set.insert(s), "should be unique"); + } + } + + #[tokio::test] + async fn test_basic() { + let serve = TestCacheServer::bind_ephemeral(); + let client = serve.client(); + + let key = CacheKey::Partition(1); + + let v1 = CacheValue::new("1".into(), 2); + assert!(client.put(key, &v1).await.unwrap()); + + let returned = client.get(key).await.unwrap().unwrap(); + assert_eq!(v1, returned); + + // Duplicate upsert ignored + assert!(!client.put(key, &v1).await.unwrap()); + + // Stale upsert ignored + let v2 = CacheValue::new("2".into(), 1); + assert!(!client.put(key, &v2).await.unwrap()); + + let returned = client.get(key).await.unwrap().unwrap(); + assert_eq!(v1, returned); + + let v3 = CacheValue::new("3".into(), 3); + assert!(client.put(key, &v3).await.unwrap()); + + let returned = client.get(key).await.unwrap().unwrap(); + assert_eq!(v3, returned); + + let key2 = CacheKey::Partition(5); + assert!(client.put(key2, &v1).await.unwrap()); + + let mut result = client.list(None).try_collect::>().await.unwrap(); + result.sort_unstable_by_key(|entry| entry.key()); + + let expected = vec![ListEntry::new(key, v3), ListEntry::new(key2, v1)]; + assert_eq!(result, expected); + + serve.shutdown().await; + } + + #[tokio::test] + async fn test_list_size() { + let serve = TestCacheServer::bind_ephemeral(); + let client = serve.client(); + + let v1 = CacheValue::new("123".into(), 2); + client.put(CacheKey::Table(1), &v1).await.unwrap(); + + let v2 = CacheValue::new("13".into(), 2); + client.put(CacheKey::Table(2), &v2).await.unwrap(); + + let 
v3 = CacheValue::new("1".into(), 2); + client.put(CacheKey::Table(3), &v3).await.unwrap(); + + let mut res = client.list(Some(2)).try_collect::>().await.unwrap(); + res.sort_unstable_by_key(|x| x.key()); + + assert_eq!(res.len(), 3); + + assert_eq!(res[0].value(), None); + assert_eq!(res[1].value(), Some(&v2.data)); + assert_eq!(res[2].value(), Some(&v3.data)); + + let mut res = client.list(Some(3)).try_collect::>().await.unwrap(); + res.sort_unstable_by_key(|x| x.key()); + + assert_eq!(res[0].value(), Some(&v1.data)); + assert_eq!(res[1].value(), Some(&v2.data)); + assert_eq!(res[2].value(), Some(&v3.data)); + } +} diff --git a/catalog_cache/src/api/quorum.rs b/catalog_cache/src/api/quorum.rs new file mode 100644 index 0000000..17c4edf --- /dev/null +++ b/catalog_cache/src/api/quorum.rs @@ -0,0 +1,459 @@ +//! Client for performing quorum catalog reads/writes + +use crate::api::client::{CatalogCacheClient, Error as ClientError}; +use crate::local::CatalogCache; +use crate::{CacheKey, CacheValue}; +use futures::channel::oneshot; +use futures::future::{select, Either}; +use futures::{pin_mut, StreamExt}; +use snafu::{ResultExt, Snafu}; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::task::JoinError; +use tokio_util::sync::CancellationToken; + +/// Error for [`QuorumCatalogCache`] +#[allow(missing_docs)] +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to communicate with any remote replica: {source}"))] + NoRemote { source: ClientError }, + + #[snafu(display("Write task was aborted"))] + Cancelled, + + #[snafu(display("Join Error: {source}"))] + Join { source: JoinError }, + + #[snafu(display("Failed to establish a read quorum: {generations:?}"))] + Quorum { + generations: [Result, ClientError>; 3], + }, + + #[snafu(display("Failed to list replica: {source}"))] + List { source: ClientError }, + + #[snafu(display("Local cache error: {source}"), context(false))] + Local { source: crate::local::Error }, +} + +/// Result for 
[`QuorumCatalogCache`] +pub type Result = std::result::Result; + +/// Performs quorum reads and writes across a local [`CatalogCache`] and two [`CatalogCacheClient`] +#[derive(Debug)] +pub struct QuorumCatalogCache { + local: Arc, + replicas: Arc<[CatalogCacheClient; 2]>, + shutdown: CancellationToken, +} + +impl Drop for QuorumCatalogCache { + fn drop(&mut self) { + self.shutdown.cancel() + } +} + +impl QuorumCatalogCache { + /// Create a new [`QuorumCatalogCache`] + pub fn new(local: Arc, replicas: Arc<[CatalogCacheClient; 2]>) -> Self { + Self { + local, + replicas, + shutdown: CancellationToken::new(), + } + } + + /// Retrieve the given value from the remote cache + /// + /// Returns `None` if value is not present in a quorum of replicas + /// Returns [`Error::Quorum`] if cannot establish a read quorum + pub async fn get(&self, key: CacheKey) -> Result> { + let local = self.local.get(key); + + let fut1 = self.replicas[0].get(key); + let fut2 = self.replicas[1].get(key); + pin_mut!(fut1); + pin_mut!(fut2); + + match select(fut1, fut2).await { + Either::Left((result, fut)) | Either::Right((result, fut)) => match (local, result) { + (None, Ok(None)) => Ok(None), + (Some(l), Ok(Some(r))) if l.generation <= r.generation => { + // preempt write from remote to local that arrives late + if l.generation < r.generation { + self.local.insert(key, r.clone())?; + } + Ok(Some(r)) + } + (local, r1) => { + // r1 either failed or did not return anything + let r2 = fut.await; + match (local, r1, r2) { + (None, _, Ok(None)) | (_, Ok(None), Ok(None)) => Ok(None), + (Some(l), _, Ok(Some(r))) if l.generation <= r.generation => { + // preempt write from remote to local that arrives late + if l.generation < r.generation { + self.local.insert(key, r.clone())?; + } + Ok(Some(r)) + } + (local, Ok(Some(l)), Ok(Some(r))) if l.generation == r.generation => { + if local.map(|x| x.generation < l.generation).unwrap_or(true) { + self.local.insert(key, l.clone())?; + } + Ok(Some(l)) + } + (l, 
r1, r2) => Err(Error::Quorum { + generations: [ + Ok(l.map(|x| x.generation)), + r1.map(|x| x.map(|x| x.generation)), + r2.map(|x| x.map(|x| x.generation)), + ], + }), + } + } + }, + } + } + + /// Upsert the given key-value pair + /// + /// Returns Ok if able to replicate the write to a quorum + pub async fn put(&self, key: CacheKey, value: CacheValue) -> Result<()> { + self.local.insert(key, value.clone())?; + + let replicas = Arc::clone(&self.replicas); + let (sender, receiver) = oneshot::channel(); + + let fut = async move { + let fut1 = replicas[0].put(key, &value); + let fut2 = replicas[1].put(key, &value); + pin_mut!(fut1); + pin_mut!(fut2); + + match select(fut1, fut2).await { + Either::Left((r, fut)) | Either::Right((r, fut)) => { + let _ = sender.send(r); + fut.await + } + } + }; + + // We spawn a tokio task so that we can potentially continue to replicate + // to the second replica asynchronously once we receive an ok response + let cancel = self.shutdown.child_token(); + let handle = tokio::spawn(async move { + let cancelled = cancel.cancelled(); + pin_mut!(fut); + pin_mut!(cancelled); + match select(cancelled, fut).await { + Either::Left(_) => Err(Error::Cancelled), + Either::Right((Ok(_), _)) => Ok(()), + Either::Right((Err(source), _)) => Err(Error::NoRemote { source }), + } + }); + + match receiver.await { + Ok(Ok(_)) => Ok(()), + _ => match handle.await { + Ok(r) => r, + Err(source) => Err(Error::Join { source }), + }, + } + } + + /// Warm the local cache by performing quorum reads from the other two replicas + /// + /// This method should be called after this server has been participating in the write quorum + /// for a period of time, e.g. 1 minute. This avoids an issue where a quorum cannot be + /// established for in-progress writes. 
+ pub async fn warm(&self) -> Result<()> { + // List doesn't return keys in any particular order + // + // We therefore build a hashmap with the keys from one replica and compare + // this against those returned by the other + // + // We don't need to consult the local `CatalogCache`, as we only need to insert + // if a read quorum can be established between the replicas and isn't present locally + let mut generations = HashMap::with_capacity(128); + let mut list = self.replicas[0].list(Some(0)); + while let Some(entry) = list.next().await.transpose().context(ListSnafu)? { + if let Some(k) = entry.key() { + generations.insert(k, entry.generation()); + } + } + + let mut list = self.replicas[1].list(None); + while let Some(entry) = list.next().await.transpose().context(ListSnafu)? { + if let Some(k) = entry.key() { + match (generations.get(&k), entry.value()) { + (Some(generation), Some(v)) if *generation == entry.generation() => { + let value = CacheValue::new(v.clone(), *generation); + // In the case that local already has the given version + // this will be a no-op + self.local.insert(k, value)?; + } + _ => {} + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::api::server::test_util::TestCacheServer; + use std::future::Future; + use std::task::Context; + use std::time::Duration; + + #[tokio::test] + async fn test_basic() { + let local = Arc::new(CatalogCache::default()); + let r1 = TestCacheServer::bind_ephemeral(); + let r2 = TestCacheServer::bind_ephemeral(); + + let replicas = Arc::new([r1.client(), r2.client()]); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + assert_eq!(quorum.get(CacheKey::Table(1)).await.unwrap(), None); + + let k1 = CacheKey::Table(1); + let k2 = CacheKey::Table(2); + let k3 = CacheKey::Table(3); + + let v1 = CacheValue::new("foo".into(), 2); + quorum.put(k1, v1.clone()).await.unwrap(); + quorum.put(k2, v1.clone()).await.unwrap(); + + let r = 
quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // New value + let v2 = CacheValue::new("foo".into(), 4); + quorum.put(k2, v2.clone()).await.unwrap(); + + let r = quorum.get(k1).await.unwrap().unwrap(); + assert_eq!(r, v1); + + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v2); + + // Can remove value from one replica and still get quorum + r2.cache().delete(k2).unwrap(); + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v2); + + // Loss of two copies results in not found + r1.cache().delete(k2).unwrap(); + let r = quorum.get(k2).await.unwrap(); + assert_eq!(r, None); + + // Simulate stale value in r1 + r1.cache().insert(k2, v1.clone()).unwrap(); + let err = quorum.get(k2).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + + // If quorum has stale value follows quorum + r2.cache().delete(k2); + r2.cache().insert(k2, v1.clone()).unwrap(); + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // Simulate loss of replica 2 + r2.shutdown().await; + + // Can still establish a write quorum + quorum.put(k3, v1.clone()).await.unwrap(); + + // Can read newly inserted value + let r = quorum.get(k3).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // Can still read from quorum of k1 + let r = quorum.get(k1).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // Cannot get quorum as lost single node and local disagrees with replica 1 + let err = quorum.get(k2).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + + // Can establish quorum following write + quorum.put(k2, v2.clone()).await.unwrap(); + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v2); + + // Still cannot establish quorum + r1.cache().delete(k2); + let err = quorum.get(k2).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. 
}), "{err}"); + + // k2 is now no longer present anywhere, can establish quorum + local.delete(k2); + let r = quorum.get(k2).await.unwrap(); + assert_eq!(r, None); + + // Simulate loss of replica 1 (in addition to replica 2) + r1.shutdown().await; + + // Can no longer get quorum for anything + let err = quorum.get(k1).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + } + + #[tokio::test] + async fn test_read_through() { + let local = Arc::new(CatalogCache::default()); + let r1 = TestCacheServer::bind_ephemeral(); + let r2 = TestCacheServer::bind_ephemeral(); + + let replicas = Arc::new([r1.client(), r2.client()]); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + let key = CacheKey::Table(1); + let v0 = CacheValue::new("v0".into(), 0); + + r1.cache().insert(key, v0.clone()).unwrap(); + r2.cache().insert(key, v0.clone()).unwrap(); + + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v0); + + // Should have read-through to local + assert_eq!(local.get(key).unwrap(), v0); + + let v1 = CacheValue::new("v1".into(), 1); + let v2 = CacheValue::new("v2".into(), 2); + + r1.cache().insert(key, v1.clone()).unwrap(); + r2.cache().insert(key, v2.clone()).unwrap(); + + // A quorum request will get either v1 or v2 depending on which it contacts first + let result = quorum.get(key).await.unwrap().unwrap(); + assert!(result == v1 || result == v2, "{result:?}"); + + // Should read-through + assert_eq!(local.get(key).unwrap(), result); + + // Update r1 with version 2 + r1.cache().insert(key, v2.clone()).unwrap(); + + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v2); + + // Should read-through + assert_eq!(local.get(key).unwrap(), v2); + + let v3 = CacheValue::new("v3".into(), 3); + local.insert(key, v3.clone()).unwrap(); + + // Should establish quorum for v2 even though local is v3 + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v2); + 
+ // Should not read-through + assert_eq!(local.get(key).unwrap(), v3); + + let v4 = CacheValue::new("v4".into(), 4); + let v5 = CacheValue::new("v5".into(), 5); + + local.insert(key, v5.clone()).unwrap(); + r1.cache().insert(key, v4.clone()).unwrap(); + + // Should fail as cannot establish quorum of three different versions of `[5, 4, 2]` + // and has latest version locally + let err = quorum.get(key).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + assert_eq!(local.get(key).unwrap(), v5); + + let v6 = CacheValue::new("v6".into(), 6); + r1.cache().insert(key, v6.clone()).unwrap(); + + // Should succeed as r1 has newer version than local + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v6); + + // Should read-through + assert_eq!(local.get(key).unwrap(), v6); + } + + #[tokio::test] + async fn test_warm() { + let local = Arc::new(CatalogCache::default()); + let r1 = TestCacheServer::bind_ephemeral(); + let r2 = TestCacheServer::bind_ephemeral(); + + let replicas = Arc::new([r1.client(), r2.client()]); + let quorum = QuorumCatalogCache::new(local, Arc::clone(&replicas)); + + let k1 = CacheKey::Table(1); + let v1 = CacheValue::new("v1".into(), 1); + quorum.put(k1, v1.clone()).await.unwrap(); + + let k2 = CacheKey::Table(2); + let v2 = CacheValue::new("v2".into(), 1); + quorum.put(k2, v2.clone()).await.unwrap(); + + // Simulate local restart + let local = Arc::new(CatalogCache::default()); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + assert_eq!(local.list().count(), 0); + + quorum.warm().await.unwrap(); + + // Should populate both entries + let mut entries: Vec<_> = local.list().collect(); + entries.sort_unstable_by_key(|(k, _)| *k); + assert_eq!(entries, vec![(k1, v1.clone()), (k2, v2.clone())]); + + // Simulate local restart + let local = Arc::new(CatalogCache::default()); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + // 
Simulate in-progress write + let v3 = CacheValue::new("v3".into(), 2); + assert!(r1.cache().insert(k2, v3.clone()).unwrap()); + + // Cannot establish quorum for k1 so should skip over + quorum.warm().await.unwrap(); + let entries: Vec<_> = local.list().collect(); + assert_eq!(entries.len(), 1); + assert_eq!(entries[0], (k1, v1.clone())); + + // If r2 updated warming should pick up new quorum + assert!(r2.cache().insert(k2, v3.clone()).unwrap()); + quorum.warm().await.unwrap(); + let mut entries: Vec<_> = local.list().collect(); + entries.sort_unstable_by_key(|(k, _)| *k); + assert_eq!(entries, vec![(k1, v1), (k2, v3)]); + + // Test cancellation safety + let k3 = CacheKey::Table(3); + let fut = quorum.put(k3, v2.clone()); + { + // `fut` is dropped (cancelled) on exit from this code block + pin_mut!(fut); + + let noop_waker = futures::task::noop_waker(); + let mut cx = Context::from_waker(&noop_waker); + assert!(fut.poll(&mut cx).is_pending()); + } + + // Write should still propagate asynchronously + let mut attempts = 0; + loop { + tokio::time::sleep(Duration::from_millis(1)).await; + match quorum.get(k3).await { + Ok(Some(_)) => break, + _ => { + assert!(attempts < 100); + attempts += 1; + } + } + } + } +} diff --git a/catalog_cache/src/api/server.rs b/catalog_cache/src/api/server.rs new file mode 100644 index 0000000..b29d841 --- /dev/null +++ b/catalog_cache/src/api/server.rs @@ -0,0 +1,300 @@ +//! 
Server for the cache HTTP API + +use crate::api::list::{ListEncoder, ListEntry}; +use crate::api::{RequestPath, GENERATION}; +use crate::local::CatalogCache; +use crate::CacheValue; +use futures::ready; +use hyper::body::HttpBody; +use hyper::header::ToStrError; +use hyper::http::request::Parts; +use hyper::service::Service; +use hyper::{Body, Method, Request, Response, StatusCode}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::convert::Infallible; +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Http error: {source}"), context(false))] + Http { source: hyper::http::Error }, + + #[snafu(display("Hyper error: {source}"), context(false))] + Hyper { source: hyper::Error }, + + #[snafu(display("Local cache error: {source}"), context(false))] + Local { source: crate::local::Error }, + + #[snafu(display("Non UTF-8 Header: {source}"))] + BadHeader { source: ToStrError }, + + #[snafu(display("Request missing generation header"))] + MissingGeneration, + + #[snafu(display("Invalid generation header: {source}"))] + InvalidGeneration { source: std::num::ParseIntError }, + + #[snafu(display("List query missing size"))] + MissingSize, + + #[snafu(display("List query invalid size: {source}"))] + InvalidSize { source: std::num::ParseIntError }, +} + +impl Error { + /// Convert an error into a [`Response`] + fn response(self) -> Response { + let mut response = Response::new(Body::from(self.to_string())); + *response.status_mut() = match &self { + Self::Http { .. } | Self::Hyper { .. } | Self::Local { .. } => { + StatusCode::INTERNAL_SERVER_ERROR + } + Self::InvalidGeneration { .. } + | Self::MissingGeneration + | Self::InvalidSize { .. } + | Self::MissingSize + | Self::BadHeader { .. 
} => StatusCode::BAD_REQUEST, + }; + response + } +} + +/// A [`Service`] that wraps a [`CatalogCache`] +#[derive(Debug, Clone)] +pub struct CatalogCacheService(Arc); + +/// Shared state for [`CatalogCacheService`] +#[derive(Debug)] +struct ServiceState { + cache: Arc, +} + +impl Service> for CatalogCacheService { + type Response = Response; + + type Error = Infallible; + type Future = CatalogRequestFuture; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let (parts, body) = req.into_parts(); + CatalogRequestFuture { + parts, + body, + buffer: vec![], + state: Arc::clone(&self.0), + } + } +} + +/// The future for [`CatalogCacheService`] +#[derive(Debug)] +pub struct CatalogRequestFuture { + /// The request body + body: Body, + /// The request parts + parts: Parts, + /// The in-progress body + /// + /// We use Vec not Bytes to ensure the cache isn't storing slices of large allocations + buffer: Vec, + /// The cache to service requests + state: Arc, +} + +impl Future for CatalogRequestFuture { + type Output = Result, Infallible>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let r = loop { + match ready!(Pin::new(&mut self.body).poll_data(cx)) { + Some(Ok(b)) => self.buffer.extend_from_slice(&b), + Some(Err(e)) => break Err(e.into()), + None => break Ok(()), + } + }; + Poll::Ready(Ok(match r.and_then(|_| self.call()) { + Ok(resp) => resp, + Err(e) => e.response(), + })) + } +} + +impl CatalogRequestFuture { + fn call(&mut self) -> Result, Error> { + let body = std::mem::take(&mut self.buffer); + + let status = match RequestPath::parse(self.parts.uri.path()) { + Some(RequestPath::List) => match self.parts.method { + Method::GET => { + let query = self.parts.uri.query().context(MissingSizeSnafu)?; + let mut parts = url::form_urlencoded::parse(query.as_bytes()); + let (_, size) = parts.find(|(k, _)| k == "size").context(MissingSizeSnafu)?; + let 
size = size.parse().context(InvalidSizeSnafu)?; + + let iter = self.state.cache.list(); + let entries = iter.map(|(k, v)| ListEntry::new(k, v)).collect(); + let encoder = ListEncoder::new(entries).with_max_value_size(size); + + let stream = futures::stream::iter(encoder.map(Ok::<_, Error>)); + let response = Response::builder().body(Body::wrap_stream(stream))?; + return Ok(response); + } + _ => StatusCode::METHOD_NOT_ALLOWED, + }, + Some(RequestPath::Resource(key)) => match self.parts.method { + Method::GET => match self.state.cache.get(key) { + Some(value) => { + let response = Response::builder() + .header(&GENERATION, value.generation) + .body(value.data.into())?; + return Ok(response); + } + None => StatusCode::NOT_FOUND, + }, + Method::PUT => { + let headers = &self.parts.headers; + let generation = headers.get(&GENERATION).context(MissingGenerationSnafu)?; + let generation = generation.to_str().context(BadHeaderSnafu)?; + let generation = generation.parse().context(InvalidGenerationSnafu)?; + let value = CacheValue::new(body.into(), generation); + + match self.state.cache.insert(key, value)? { + true => StatusCode::OK, + false => StatusCode::NOT_MODIFIED, + } + } + Method::DELETE => { + self.state.cache.delete(key); + StatusCode::OK + } + _ => StatusCode::METHOD_NOT_ALLOWED, + }, + None => StatusCode::NOT_FOUND, + }; + + let mut response = Response::new(Body::empty()); + *response.status_mut() = status; + Ok(response) + } +} + +/// Runs a [`CatalogCacheService`] in a background task +/// +/// Will abort the background task on drop +#[derive(Debug)] +pub struct CatalogCacheServer { + state: Arc, +} + +impl CatalogCacheServer { + /// Create a new [`CatalogCacheServer`]. + /// + /// Note that the HTTP interface needs to be wired up in some higher-level structure. Use [`service`](Self::service) + /// for that. + pub fn new(cache: Arc) -> Self { + let state = Arc::new(ServiceState { cache }); + + Self { state } + } + + /// Returns HTTP service. 
+ pub fn service(&self) -> CatalogCacheService { + CatalogCacheService(Arc::clone(&self.state)) + } + + /// Returns a reference to the [`CatalogCache`] of this server + pub fn cache(&self) -> &Arc { + &self.state.cache + } +} + +/// Test utilities. +pub mod test_util { + use std::{net::SocketAddr, ops::Deref}; + + use hyper::{service::make_service_fn, Server}; + use tokio::task::JoinHandle; + use tokio_util::sync::CancellationToken; + + use crate::api::client::CatalogCacheClient; + + use super::*; + + /// Test runner for a [`CatalogCacheServer`]. + #[derive(Debug)] + pub struct TestCacheServer { + addr: SocketAddr, + server: CatalogCacheServer, + shutdown: CancellationToken, + handle: Option>, + } + + impl TestCacheServer { + /// Create a new [`TestCacheServer`] bound to an ephemeral port + pub fn bind_ephemeral() -> Self { + Self::bind(&SocketAddr::from(([127, 0, 0, 1], 0))) + } + + /// Create a new [`CatalogCacheServer`] bound to the provided [`SocketAddr`] + pub fn bind(addr: &SocketAddr) -> Self { + let server = CatalogCacheServer::new(Arc::new(CatalogCache::default())); + let service = server.service(); + let make_service = make_service_fn(move |_conn| { + futures::future::ready(Ok::<_, Infallible>(service.clone())) + }); + + let hyper_server = Server::bind(addr).serve(make_service); + let addr = hyper_server.local_addr(); + + let shutdown = CancellationToken::new(); + let signal = shutdown.clone().cancelled_owned(); + let graceful = hyper_server.with_graceful_shutdown(signal); + let handle = Some(tokio::spawn(async move { graceful.await.unwrap() })); + + Self { + addr, + server, + shutdown, + handle, + } + } + + /// Returns a [`CatalogCacheClient`] for communicating with this server + pub fn client(&self) -> CatalogCacheClient { + let addr = format!("http://{}", self.addr); + CatalogCacheClient::try_new(addr.parse().unwrap()).unwrap() + } + + /// Triggers and waits for graceful shutdown + pub async fn shutdown(mut self) { + self.shutdown.cancel(); + if let 
Some(x) = self.handle.take() { + x.await.unwrap() + } + } + } + + impl Deref for TestCacheServer { + type Target = CatalogCacheServer; + + fn deref(&self) -> &Self::Target { + &self.server + } + } + + impl Drop for TestCacheServer { + fn drop(&mut self) { + if let Some(x) = &self.handle { + x.abort() + } + } + } +} diff --git a/catalog_cache/src/lib.rs b/catalog_cache/src/lib.rs new file mode 100644 index 0000000..0370448 --- /dev/null +++ b/catalog_cache/src/lib.rs @@ -0,0 +1,143 @@ +//! Consistent cache system used by the catalog service +//! +//! # Design +//! +//! The catalog service needs to be able to service queries without needing to communicate +//! with its underlying backing store. This serves the dual purpose of reducing load on this +//! backing store, and also returning results in a more timely manner. +//! +//! This caching must be transparent to the users of the catalog service, and therefore must not +//! introduce eventually consistent behaviour, or other consistency effects. +//! +//! As such this crate provides a strongly-consistent, distributed key-value cache. +//! +//! In order to keep things simple, this only provides a mapping from [`CacheKey`] to opaque +//! binary payloads, with no support for structured payloads. +//! +//! This avoids: +//! +//! * Complex replicated state machines +//! * Forward compatibility challenges where newer data can't roundtrip through older servers +//! * Simple to introspect, debug and reason about +//! * Predictable and easily quantifiable memory usage +//! +//! However, it does have the following implications: +//! +//! * Care must be taken to ensure that parsing of the cached payloads does not become a bottleneck +//! * Large values (> 1MB) should be avoided, as updates will resend the entire value +//! +//! ## Components +//! +//! This crate is broken into multiple parts +//! +//! * [`CatalogCache`] provides a local key value store +//! 
* [`CatalogCacheService`] exposes this [`CatalogCache`] over an HTTP API +//! * [`CatalogCacheClient`] communicates with a remote [`CatalogCacheService`] +//! * [`QuorumCatalogCache`] combines the above into a strongly-consistent distributed cache +//! +//! [`CatalogCache`]: local::CatalogCache +//! [`CatalogCacheClient`]: api::client::CatalogCacheClient +//! [`CatalogCacheService`]: api::server::CatalogCacheService +//! [`QuorumCatalogCache`]: api::quorum::QuorumCatalogCache +//! +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +use bytes::Bytes; +use std::sync::atomic::AtomicBool; + +pub mod api; +pub mod local; + +/// The types of catalog cache key +#[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)] +pub enum CacheKey { + /// A catalog namespace + Namespace(i64), + /// A catalog table + Table(i64), + /// A catalog partition + Partition(i64), +} + +impl CacheKey { + /// Variant as string. + /// + /// This can be used for logging and metrics. + pub fn variant(&self) -> &'static str { + match self { + Self::Namespace(_) => "namespace", + Self::Table(_) => "table", + Self::Partition(_) => "partition", + } + } + + /// Untyped ID. 
+ pub fn id(&self) -> i64 { + match self { + Self::Namespace(id) => *id, + Self::Table(id) => *id, + Self::Partition(id) => *id, + } + } +} + +/// A value stored in [`CatalogCache`](local::CatalogCache) +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct CacheValue { + /// The data stored for this cache + data: Bytes, + /// The generation of this cache data + generation: u64, +} + +impl CacheValue { + /// Create a new [`CacheValue`] with the provided `data` and `generation` + pub fn new(data: Bytes, generation: u64) -> Self { + Self { data, generation } + } + + /// The data stored for this cache + pub fn data(&self) -> &Bytes { + &self.data + } + + /// The generation of this cache data + pub fn generation(&self) -> u64 { + self.generation + } +} + +/// Combines a [`CacheValue`] with an [`AtomicBool`] for the purposes of NRU-eviction +#[derive(Debug)] +struct CacheEntry { + /// The value of this cache entry + value: CacheValue, + /// An atomic flag that is set to `true` by `CatalogCache::get` and + /// cleared by `CatalogCache::evict_unused` + used: AtomicBool, +} + +impl From for CacheEntry { + fn from(value: CacheValue) -> Self { + Self { + value, + // Values start used to prevent racing with `evict_unused` + used: AtomicBool::new(true), + } + } +} diff --git a/catalog_cache/src/local/limit.rs b/catalog_cache/src/local/limit.rs new file mode 100644 index 0000000..6c38fee --- /dev/null +++ b/catalog_cache/src/local/limit.rs @@ -0,0 +1,82 @@ +//! 
A memory limiter

use super::{Error, Result};
use std::sync::atomic::{AtomicUsize, Ordering};

/// Byte-budget accounting for the cache: lock-free reserve/free of sizes
/// against a fixed upper bound.
#[derive(Debug)]
pub(crate) struct MemoryLimiter {
    // Bytes currently reserved; only mutated via `reserve`/`free`
    current: AtomicUsize,
    // Immutable hard upper bound in bytes
    limit: usize,
}

impl MemoryLimiter {
    /// Create a new [`MemoryLimiter`] limited to `limit` bytes
    pub(crate) fn new(limit: usize) -> Self {
        Self {
            current: AtomicUsize::new(0),
            limit,
        }
    }

    /// Reserve `size` bytes, returning an error if this would exceed the limit
    pub(crate) fn reserve(&self, size: usize) -> Result<()> {
        let limit = self.limit;
        // `max` is the largest `current` for which `current + size` still fits;
        // a request larger than the whole budget fails up-front with `TooLarge`
        let max = limit
            .checked_sub(size)
            .ok_or(Error::TooLarge { size, limit })?;

        // We can use relaxed ordering as not relying on this to
        // synchronise memory accesses beyond itself
        self.current
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| {
                // This cannot overflow as current + size <= limit
                (current <= max).then_some(current + size)
            })
            .map_err(|current| Error::OutOfMemory {
                size,
                current,
                limit,
            })?;
        Ok(())
    }

    /// Free `size` bytes
    ///
    /// Callers must only free what they previously reserved; no underflow check here
    pub(crate) fn free(&self, size: usize) {
        self.current.fetch_sub(size, Ordering::Relaxed);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_limiter() {
        let limiter = MemoryLimiter::new(100);

        limiter.reserve(20).unwrap();
        limiter.reserve(70).unwrap();

        let err = limiter.reserve(20).unwrap_err().to_string();
        assert_eq!(err, "Cannot reserve additional 20 bytes for cache containing 90 bytes as would exceed limit of 100 bytes");

        limiter.reserve(10).unwrap();
        limiter.reserve(0).unwrap();

        let err = limiter.reserve(1).unwrap_err().to_string();
        assert_eq!(err, "Cannot reserve additional 1 bytes for cache containing 100 bytes as would exceed limit of 100 bytes");

        limiter.free(10);
        limiter.reserve(10).unwrap();

        limiter.free(100);

        // Can add single value taking entire range
        limiter.reserve(100).unwrap();
        limiter.free(100);

        // Protected against overflow
        let err =
limiter.reserve(usize::MAX).unwrap_err(); + assert!(matches!(err, Error::TooLarge { .. }), "{err}"); + } +} diff --git a/catalog_cache/src/local/mod.rs b/catalog_cache/src/local/mod.rs new file mode 100644 index 0000000..373dd62 --- /dev/null +++ b/catalog_cache/src/local/mod.rs @@ -0,0 +1,355 @@ +//! A local in-memory cache + +mod limit; + +use crate::local::limit::MemoryLimiter; +use crate::{CacheEntry, CacheKey, CacheValue}; +use dashmap::mapref::entry::Entry; +use dashmap::DashMap; +use snafu::Snafu; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +/// Error for [`CatalogCache`] +#[derive(Debug, Snafu)] +#[allow(missing_docs, missing_copy_implementations)] +pub enum Error { + #[snafu(display("Cannot reserve additional {size} bytes for cache containing {current} bytes as would exceed limit of {limit} bytes"))] + OutOfMemory { + size: usize, + current: usize, + limit: usize, + }, + + #[snafu(display("Cannot reserve additional {size} bytes for cache as request exceeds total memory limit of {limit} bytes"))] + TooLarge { size: usize, limit: usize }, +} + +/// Result for [`CatalogCache`] +pub type Result = std::result::Result; + +/// A trait for observing updated to [`CatalogCache`] +/// +/// This can be used for injecting metrics, maintaining secondary indices or otherwise +/// +/// Note: members are invoked under locks in [`CatalogCache`] and should therefore +/// be short-running and not call back into [`CatalogCache`] +pub trait CatalogCacheObserver: std::fmt::Debug + Send + Sync { + /// Called before a value is potentially inserted into [`CatalogCache`] + /// + /// This is called regardless of it [`CatalogCache`] already contains the value + fn insert(&self, key: CacheKey, new: &CacheValue, old: Option<&CacheValue>); + + /// A key removed from the [`CatalogCache`] + fn evict(&self, key: CacheKey, value: &CacheValue); +} + +/// A concurrent Not-Recently-Used cache mapping [`CacheKey`] to [`CacheValue`] +#[derive(Debug, Default)] +pub struct CatalogCache 
{ + map: DashMap, + observer: Option>, + limit: Option, +} + +impl CatalogCache { + /// Create a new `CatalogCache` with an optional memory limit + pub fn new(limit: Option) -> Self { + Self { + limit: limit.map(MemoryLimiter::new), + ..Default::default() + } + } + + /// Sets a [`CatalogCacheObserver`] for this [`CatalogCache`] + pub fn with_observer(self, observer: Arc) -> Self { + Self { + observer: Some(observer), + ..self + } + } + + /// Returns the value for `key` if it exists + pub fn get(&self, key: CacheKey) -> Option { + let entry = self.map.get(&key)?; + entry.used.store(true, Ordering::Relaxed); + Some(entry.value.clone()) + } + + /// Insert the given `value` into the cache + /// + /// Skips insertion and returns false iff an entry already exists with the + /// same or greater generation + pub fn insert(&self, key: CacheKey, value: CacheValue) -> Result { + match self.map.entry(key) { + Entry::Occupied(mut o) => { + let old = &o.get().value; + if value.generation <= old.generation { + return Ok(false); + } + if let Some(l) = &self.limit { + let new_len = value.data.len(); + let cur_len = old.data.len(); + match new_len > cur_len { + true => l.reserve(new_len - cur_len)?, + false => l.free(cur_len - new_len), + } + } + if let Some(v) = &self.observer { + v.insert(key, &value, Some(old)); + } + o.insert(value.into()); + } + Entry::Vacant(v) => { + if let Some(l) = &self.limit { + l.reserve(value.data.len())?; + } + if let Some(v) = &self.observer { + v.insert(key, &value, None); + } + v.insert(value.into()); + } + } + Ok(true) + } + + /// Removes the [`CacheValue`] for the given `key` if any + pub fn delete(&self, key: CacheKey) -> Option { + match self.map.entry(key) { + Entry::Occupied(o) => { + let old = &o.get().value; + if let Some(v) = &self.observer { + v.evict(key, old) + } + if let Some(l) = &self.limit { + l.free(old.data.len()) + } + Some(o.remove().value) + } + _ => None, + } + } + + /// Returns an iterator over the items in this cache + pub fn 
list(&self) -> CacheIterator<'_> { + CacheIterator(self.map.iter()) + } + + /// Evict all entries not accessed with [`CatalogCache::get`] or updated since + /// the last call to this function + /// + /// Periodically calling this provides a Not-Recently-Used eviction policy + pub fn evict_unused(&self) { + self.map.retain(|key, entry| { + let retain = entry.used.swap(false, Ordering::Relaxed); + if !retain { + if let Some(v) = &self.observer { + v.evict(*key, &entry.value); + } + if let Some(l) = &self.limit { + l.free(entry.value.data.len()); + } + } + retain + }); + } +} + +/// Iterator for [`CatalogCache`] +#[allow(missing_debug_implementations)] +pub struct CacheIterator<'a>(dashmap::iter::Iter<'a, CacheKey, CacheEntry>); + +impl<'a> Iterator for CacheIterator<'a> { + type Item = (CacheKey, CacheValue); + + fn next(&mut self) -> Option { + let value = self.0.next()?; + Some((*value.key(), value.value().value.clone())) + } + + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Bytes; + use dashmap::DashSet; + + #[derive(Debug, Default)] + struct KeyObserver { + keys: DashSet, + } + + impl KeyObserver { + fn keys(&self) -> Vec { + let mut keys: Vec<_> = self.keys.iter().map(|k| *k).collect(); + keys.sort_unstable(); + keys + } + } + + impl CatalogCacheObserver for KeyObserver { + fn insert(&self, key: CacheKey, _new: &CacheValue, _old: Option<&CacheValue>) { + self.keys.insert(key); + } + + fn evict(&self, key: CacheKey, _value: &CacheValue) { + self.keys.remove(&key); + } + } + + #[test] + fn test_basic() { + let observer = Arc::new(KeyObserver::default()); + let cache = CatalogCache::default().with_observer(Arc::clone(&observer) as _); + + let v1 = CacheValue::new("1".into(), 5); + assert!(cache.insert(CacheKey::Table(0), v1.clone()).unwrap()); + assert_eq!(cache.get(CacheKey::Table(0)).unwrap(), v1); + + // Older generation rejected + assert!(!cache + .insert(CacheKey::Table(0), 
CacheValue::new("2".into(), 3)) + .unwrap()); + + // Value unchanged + assert_eq!(cache.get(CacheKey::Table(0)).unwrap(), v1); + + // Different key accepted + let v2 = CacheValue::new("2".into(), 5); + assert!(cache.insert(CacheKey::Table(1), v2.clone()).unwrap()); + assert_eq!(cache.get(CacheKey::Table(1)).unwrap(), v2); + + let v3 = CacheValue::new("3".into(), 0); + assert!(cache.insert(CacheKey::Partition(0), v3.clone()).unwrap()); + + // Newer generation updates + let v4 = CacheValue::new("4".into(), 6); + assert!(cache.insert(CacheKey::Table(0), v4.clone()).unwrap()); + + let mut values: Vec<_> = cache.list().collect(); + values.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + assert_eq!( + values, + vec![ + (CacheKey::Table(0), v4.clone()), + (CacheKey::Table(1), v2), + (CacheKey::Partition(0), v3), + ] + ); + assert_eq!( + observer.keys(), + vec![ + CacheKey::Table(0), + CacheKey::Table(1), + CacheKey::Partition(0) + ] + ); + + assert_eq!(cache.get(CacheKey::Namespace(0)), None); + assert_eq!(cache.delete(CacheKey::Namespace(0)), None); + + assert_eq!(cache.get(CacheKey::Table(0)).unwrap(), v4); + assert_eq!(cache.delete(CacheKey::Table(0)).unwrap(), v4); + assert_eq!(cache.get(CacheKey::Table(0)), None); + + assert_eq!(cache.list().count(), 2); + assert_eq!(observer.keys.len(), 2); + } + + #[test] + fn test_nru() { + let observer = Arc::new(KeyObserver::default()); + let cache = CatalogCache::default().with_observer(Arc::clone(&observer) as _); + + let value = CacheValue::new("1".into(), 0); + cache.insert(CacheKey::Namespace(0), value.clone()).unwrap(); + cache.insert(CacheKey::Partition(0), value.clone()).unwrap(); + cache.insert(CacheKey::Table(0), value.clone()).unwrap(); + + cache.evict_unused(); + // Inserted records should only be evicted on the next pass + assert_eq!(cache.list().count(), 3); + assert_eq!(observer.keys.len(), 3); + + // Updating a record marks it used + cache + .insert(CacheKey::Table(0), CacheValue::new("2".into(), 1)) + .unwrap(); 
+ + // Fetching a record marks it used + cache.get(CacheKey::Partition(0)).unwrap(); + + // Insert a new record is used + cache.insert(CacheKey::Partition(1), value.clone()).unwrap(); + + cache.evict_unused(); + + // Namespace(0) evicted + let mut values: Vec<_> = cache.list().map(|(k, _)| k).collect(); + values.sort_unstable(); + let expected = vec![ + CacheKey::Table(0), + CacheKey::Partition(0), + CacheKey::Partition(1), + ]; + assert_eq!(values, expected); + assert_eq!(observer.keys(), expected); + + // Stale updates don't count as usage + assert!(!cache.insert(CacheKey::Partition(0), value).unwrap()); + + // Listing does not preserve recently used + assert_eq!(cache.list().count(), 3); + + cache.evict_unused(); + assert_eq!(cache.list().count(), 0); + assert_eq!(observer.keys.len(), 0) + } + + #[test] + fn test_limit() { + let cache = CatalogCache::new(Some(200)); + + let k1 = CacheKey::Table(1); + let k2 = CacheKey::Table(2); + let k3 = CacheKey::Table(3); + + let v_100 = Bytes::from(vec![0; 100]); + let v_20 = Bytes::from(vec![0; 20]); + + cache.insert(k1, CacheValue::new(v_100.clone(), 0)).unwrap(); + cache.insert(k2, CacheValue::new(v_100.clone(), 0)).unwrap(); + + let r = cache.insert(k3, CacheValue::new(v_20.clone(), 0)); + assert_eq!(r.unwrap_err().to_string(), "Cannot reserve additional 20 bytes for cache containing 200 bytes as would exceed limit of 200 bytes"); + + // Upsert k1 to 20 bytes + cache.insert(k1, CacheValue::new(v_20.clone(), 1)).unwrap(); + + // Can now insert k3 + cache.insert(k3, CacheValue::new(v_20.clone(), 0)).unwrap(); + + // Should evict nothing + cache.evict_unused(); + + // Cannot increase size of k3 to 100 + let r = cache.insert(k3, CacheValue::new(v_100.clone(), 1)); + assert_eq!(r.unwrap_err().to_string(), "Cannot reserve additional 80 bytes for cache containing 140 bytes as would exceed limit of 200 bytes"); + + cache.delete(k2).unwrap(); + cache.insert(k3, CacheValue::new(v_100.clone(), 1)).unwrap(); + + let r = 
cache.insert(k2, CacheValue::new(v_100.clone(), 1)); + assert_eq!(r.unwrap_err().to_string(), "Cannot reserve additional 100 bytes for cache containing 120 bytes as would exceed limit of 200 bytes"); + + // Should evict everything apart from k3 + cache.evict_unused(); + + cache.insert(k2, CacheValue::new(v_100.clone(), 1)).unwrap(); + } +} diff --git a/clap_blocks/Cargo.toml b/clap_blocks/Cargo.toml new file mode 100644 index 0000000..de5d836 --- /dev/null +++ b/clap_blocks/Cargo.toml @@ -0,0 +1,40 @@ +[package] +name = "clap_blocks" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +clap = { version = "4", features = ["derive", "env"] } +ed25519-dalek = { version = "2", features = ["pem"] } +futures = "0.3" +http = "0.2.11" +humantime = "2.1.0" +iox_catalog = { path = "../iox_catalog" } +iox_time = { path = "../iox_time" } +itertools = "0.12.0" +metric = { path = "../metric" } +non-empty-string = "0.2.4" +object_store = { workspace = true } +observability_deps = { path = "../observability_deps" } +parquet_cache = { path = "../parquet_cache" } +snafu = "0.8" +sysinfo = "0.30.5" +trace_exporters = { path = "../trace_exporters" } +trogging = { path = "../trogging", default-features = false, features = ["clap"] } +url = "2.4" +uuid = { version = "1", features = ["v4"] } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +tempfile = "3.9.0" +test_helpers = { path = "../test_helpers" } + +[features] +azure = ["object_store/azure"] # Optional Azure Object store support +gcp = ["object_store/gcp"] # Optional GCP object store support +aws = ["object_store/aws"] # Optional AWS / S3 object store support diff --git a/clap_blocks/src/bulk_ingest.rs b/clap_blocks/src/bulk_ingest.rs new file mode 100644 index 0000000..df383b5 --- /dev/null +++ b/clap_blocks/src/bulk_ingest.rs @@ -0,0 +1,274 @@ +//! 
CLI config for the router to enable bulk ingest APIs + +use ed25519_dalek::{ + pkcs8::{DecodePrivateKey, DecodePublicKey}, + SigningKey, VerifyingKey, +}; +use snafu::{ResultExt, Snafu}; +use std::{fs, io, path::PathBuf}; + +/// CLI config for bulk ingest. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct BulkIngestConfig { + /// Private signing key used for Parquet metadata returned from the `NewParquetMetadata` gRPC + /// API to prevent tampering/corruption of Parquet metadata provided by IOx to the process + /// preparing Parquet files for bulk ingest. + /// + /// This is a path to an Ed25519 private key file generated by OpenSSL with the command: + /// `openssl genpkey -algorithm ed25519 -out private-key-filename.pem` + /// + /// The public key used to verify signatures will be derived from this private key. Additional + /// public verification keys can be specified with + /// `-bulk-ingest-additional-verification-key-files` to support key rotation. + /// + /// If not specified, the `NewParquetMetadata` gRPC API will return unimplemented. + #[clap( + long = "bulk-ingest-metadata-signing-key-file", + env = "INFLUXDB_IOX_BULK_INGEST_METADATA_SIGNING_KEY_FILE" + )] + metadata_signing_key_file: Option, + + /// When in the process of rotating keys, specify paths to files containing public verification + /// keys of previously used private signing keys used for signing metadata in the past. + /// + /// These files can be derived from private key files with this OpenSSL command: + /// `openssl pkey -in private-key-filename.pem -pubout -out public-key-filename.pem` + /// + /// Example: "public-key-1.pem,public-key-2.pem" + /// + /// If verification of the metadata signature fails with the current public key derived from + /// the current signing key, these verification keys will be tested in order to allow older + /// signatures generated with the old key to still be validated. 
For best performance of + /// signature verification, specify the additional verification keys in order of most likely + /// candidates first (probably most recently used first). + /// + /// If no additional verification keys are specified, only the verification key associated with + /// the current metadata signing key will be used to validate signatures. + #[clap( + long = "bulk-ingest-additional-verification-key-files", + env = "INFLUXDB_IOX_BULK_INGEST_ADDITIONAL_VERIFICATION_KEY_FILES", + required = false, + num_args=1.., + value_delimiter = ',', + )] + additional_verification_key_files: Vec, + + /// Rather than using whatever object store configuration may have been specified as a source + /// of presigned upload URLs for bulk ingest, use a mock implementation that returns an upload + /// URL value that can be inspected but not used. + /// + /// Only useful for testing bulk ingest without setting up S3! Do not use this in production! + #[clap( + hide = true, + long = "bulk-ingest-use-mock-presigned-url-signer", + env = "INFLUXDB_IOX_BULK_INGEST_USE_MOCK_PRESIGNED_URL_SIGNER", + default_value = "false" + )] + pub use_mock_presigned_url_signer: bool, +} + +impl BulkIngestConfig { + /// Constructor for bulk ingest configuration. 
+ pub fn new( + metadata_signing_key_file: Option, + additional_verification_key_files: Vec, + use_mock_presigned_url_signer: bool, + ) -> Self { + Self { + metadata_signing_key_file, + additional_verification_key_files, + use_mock_presigned_url_signer, + } + } +} + +impl TryFrom<&BulkIngestConfig> for Option { + type Error = BulkIngestConfigError; + + fn try_from(config: &BulkIngestConfig) -> Result { + config + .metadata_signing_key_file + .as_ref() + .map(|signing_key_file| { + let signing_key: SigningKey = fs::read_to_string(signing_key_file) + .context(ReadingSigningKeyFileSnafu { + filename: &signing_key_file, + }) + .and_then(|file_contents| { + DecodePrivateKey::from_pkcs8_pem(&file_contents).context( + DecodingSigningKeySnafu { + filename: signing_key_file, + }, + ) + })?; + + let additional_verifying_keys: Vec<_> = config + .additional_verification_key_files + .iter() + .map(|verification_key_file| { + fs::read_to_string(verification_key_file) + .context(ReadingVerifyingKeyFileSnafu { + filename: &verification_key_file, + }) + .and_then(|file_contents| { + DecodePublicKey::from_public_key_pem(&file_contents).context( + DecodingVerifyingKeySnafu { + filename: verification_key_file, + }, + ) + }) + }) + .collect::, _>>()?; + + Ok(BulkIngestKeys { + signing_key, + additional_verifying_keys, + }) + }) + .transpose() + } +} + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum BulkIngestConfigError { + #[snafu(display("Could not read signing key from {}: {source}", filename.display()))] + ReadingSigningKeyFile { + filename: PathBuf, + source: io::Error, + }, + + #[snafu(display("Could not decode signing key from {}: {source}", filename.display()))] + DecodingSigningKey { + filename: PathBuf, + source: ed25519_dalek::pkcs8::Error, + }, + + #[snafu(display("Could not read verifying key from {}: {source}", filename.display()))] + ReadingVerifyingKeyFile { + filename: PathBuf, + source: io::Error, + }, + + #[snafu(display("Could not decode verifying key 
from {}: {source}", filename.display()))] + DecodingVerifyingKey { + filename: PathBuf, + source: ed25519_dalek::pkcs8::spki::Error, + }, +} + +/// Key values extracted from the files specified to the CLI. To get an instance, first create a +/// `BulkIngestConfig`, then call `try_from` to get a `Result` containing an +/// `Option` where the `Option` will be `Some` if the `BulkIngestConfig`'s +/// `metadata_signing_key_file` value is `Some`. +/// +/// If any filenames specified anywhere in the `BulkIngestConfig` can't be read or don't contain +/// valid key values, the `try_from` implementation will return an error. +#[derive(Debug)] +pub struct BulkIngestKeys { + /// The parsed private signing key value contained in the file specified to + /// `--bulk-ingest-metadata-signing-key-file`. + pub signing_key: SigningKey, + + /// If any files were specified in `--bulk-ingest-additional-verification-key-files`, this list + /// will contain their parsed public verification key values. + pub additional_verifying_keys: Vec, +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + use std::process::Command; + use test_helpers::{assert_contains, make_temp_file, tmp_dir}; + + #[test] + fn missing_signing_key_param() { + // No signing key file -> no keys + let config = BulkIngestConfig::try_parse_from(["something"]).unwrap(); + let keys: Option = (&config).try_into().unwrap(); + assert!(keys.is_none(), "expected None, got: {:?}", keys); + + // Even if there are additional verification key files; no signing key file means no keys + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-additional-verification-key-files", + "some-public-key-filename.pem", + ]) + .unwrap(); + let keys: Option = (&config).try_into().unwrap(); + assert!(keys.is_none(), "expected None, got: {:?}", keys); + } + + #[test] + fn signing_key_file_not_found() { + let nonexistent_filename = "do-not-create-a-file-with-this-name-or-this-test-will-fail"; + let config = 
BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-metadata-signing-key-file", + nonexistent_filename, + ]) + .unwrap(); + + let keys: Result, _> = (&config).try_into(); + let err = keys.unwrap_err(); + assert_contains!( + err.to_string(), + format!("Could not read signing key from {nonexistent_filename}") + ); + } + + #[test] + fn signing_key_file_contents_invalid() { + let signing_key_file = make_temp_file("not a valid signing key"); + let signing_key_filename = signing_key_file.path().display().to_string(); + + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-metadata-signing-key-file", + &signing_key_filename, + ]) + .unwrap(); + + let keys: Result, _> = (&config).try_into(); + let err = keys.unwrap_err(); + assert_contains!( + err.to_string(), + format!("Could not decode signing key from {signing_key_filename}") + ); + } + + #[test] + fn valid_signing_key_file_no_additional_key_files() { + let tmp_dir = tmp_dir().unwrap(); + let signing_key_filename = tmp_dir + .path() + .join("test-private-key.pem") + .display() + .to_string(); + Command::new("openssl") + .arg("genpkey") + .arg("-algorithm") + .arg("ed25519") + .arg("-out") + .arg(&signing_key_filename) + .output() + .unwrap(); + + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-metadata-signing-key-file", + &signing_key_filename, + ]) + .unwrap(); + + let keys: Result, _> = (&config).try_into(); + let keys = keys.unwrap().unwrap(); + let additional_keys = keys.additional_verifying_keys; + assert!( + additional_keys.is_empty(), + "expected additional keys to be empty, got {:?}", + additional_keys + ); + } +} diff --git a/clap_blocks/src/catalog_cache.rs b/clap_blocks/src/catalog_cache.rs new file mode 100644 index 0000000..a9b8543 --- /dev/null +++ b/clap_blocks/src/catalog_cache.rs @@ -0,0 +1,154 @@ +//! Config for the catalog cache server mode. 
+ +use std::time::Duration; + +use itertools::Itertools; +use snafu::{OptionExt, Snafu}; +use url::{Host, Url}; + +use crate::memory_size::MemorySize; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("host '{host}' is not a prefix of '{prefix}'"))] + NotAPrefix { host: String, prefix: String }, + + #[snafu(display("host '{host}' is not a valid host"))] + NotAValidHost { host: String }, + + #[snafu(display("invalid url: {source}"))] + InvalidUrl { source: url::ParseError }, + + #[snafu(display("Expected exactly two peers"))] + InvalidPeers, +} + +/// CLI config for catalog configuration +#[derive(Debug, Clone, PartialEq, Eq, clap::Parser)] +pub struct CatalogConfig { + /// Host Name + /// + /// If provided, any matching entries in peers will be ignored + #[clap(long = "hostname", env = "INFLUXDB_IOX_HOSTNAME", value_parser = Host::parse)] + pub hostname: Option>, + + /// Peers + /// + /// Can be provided as a comma-separated list, or on the command line multiple times + #[clap( + long = "catalog-cache-peers", + env = "INFLUXDB_IOX_CATALOG_CACHE_PEERS", + required = false, + value_delimiter = ',' + )] + pub peers: Vec, + + /// Warmup delay. + /// + /// The warm-up (via dumping the cache of our peers) is delayed by the given time to make sure that we already + /// receive quorum writes. This ensure a gaplass transition / roll-out w/o any cache MISSes (esp. w/o any backend requests). + #[clap( + long = "catalog-cache-warmup-delay", + env = "INFLUXDB_IOX_CATALOG_CACHE_WARMUP_DELAY", + default_value = default_warmup_delay(), + value_parser = humantime::parse_duration, + )] + pub warmup_delay: Duration, + + /// Garbage collection interval. + /// + /// Every time this interval past, cache elements that have not been used (i.e. read or updated) since the last time + /// are evicted from the cache. 
+ #[clap( + long = "catalog-cache-gc-interval", + env = "INFLUXDB_IOX_CATALOG_CACHE_GC_INTERVAL", + default_value = default_gc_interval(), + value_parser = humantime::parse_duration, + )] + pub gc_interval: Duration, + + /// Maximum number of bytes that should be cached within the catalog cache. + /// + /// If that limit is exceeded, no new values are accepted. This is meant as a safety measurement. You should adjust + /// your pod size and the GC interval (`--catalog-cache-gc-interval` / `INFLUXDB_IOX_CATALOG_CACHE_GC_INTERVAL`) to + /// your workload. + /// + /// Can be given as absolute value or in percentage of the total available memory (e.g. `10%`). + #[clap( + long = "catalog-cache-size-limit", + env = "INFLUXDB_IOX_CATALOG_CACHE_SIZE_LIMIT", + default_value = "1073741824", // 1GB + action + )] + pub cache_size_limit: MemorySize, + + /// Number of concurrent quorum operations that a single request can trigger. + #[clap( + long = "catalog-cache-quorum-fanout", + env = "INFLUXDB_IOX_CATALOG_CACHE_QUORUM_FANOUT", + default_value_t = 10 + )] + pub quorum_fanout: usize, +} + +impl CatalogConfig { + /// Return URL of other catalog cache nodes. 
+ pub fn peers(&self) -> Result<[Url; 2], Error> { + let (peer1, peer2) = self + .peers + .iter() + .filter(|x| match (x.host(), &self.hostname) { + (Some(a), Some(r)) => &a != r, + _ => true, + }) + .collect_tuple() + .context(InvalidPeersSnafu)?; + + Ok([peer1.clone(), peer2.clone()]) + } +} + +fn default_warmup_delay() -> &'static str { + let s = humantime::format_duration(Duration::from_secs(60 * 5)).to_string(); + Box::leak(Box::new(s)) +} + +fn default_gc_interval() -> &'static str { + let s = humantime::format_duration(Duration::from_secs(60 * 15)).to_string(); + Box::leak(Box::new(s)) +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + + #[test] + fn test_peers() { + let config = CatalogConfig::parse_from([ + "binary", + "--catalog-cache-peers", + "http://peer1:8080", + "--catalog-cache-peers", + "http://peer2:9090", + ]); + let peer1 = Url::parse("http://peer1:8080").unwrap(); + let peer2 = Url::parse("http://peer2:9090").unwrap(); + + let peers = config.peers().unwrap(); + assert_eq!(peers, [peer1.clone(), peer2.clone()]); + + let mut config = CatalogConfig::parse_from([ + "binary", + "--catalog-cache-peers", + "http://peer1:8080,http://peer2:9090,http://peer3:9091", + ]); + let err = config.peers().unwrap_err(); + assert!(matches!(err, Error::InvalidPeers), "{err}"); + + config.hostname = Some(Host::parse("peer3").unwrap()); + let peers = config.peers().unwrap(); + assert_eq!(peers, [peer1.clone(), peer2.clone()]); + } +} diff --git a/clap_blocks/src/catalog_dsn.rs b/clap_blocks/src/catalog_dsn.rs new file mode 100644 index 0000000..74e84bc --- /dev/null +++ b/clap_blocks/src/catalog_dsn.rs @@ -0,0 +1,176 @@ +//! Catalog-DSN-related configs. 
+use http::uri::InvalidUri; +use iox_catalog::grpc::client::GrpcCatalogClient; +use iox_catalog::sqlite::{SqliteCatalog, SqliteConnectionOptions}; +use iox_catalog::{ + interface::Catalog, + mem::MemCatalog, + postgres::{PostgresCatalog, PostgresConnectionOptions}, +}; +use iox_time::TimeProvider; +use observability_deps::tracing::*; +use snafu::{ResultExt, Snafu}; +use std::{sync::Arc, time::Duration}; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Unknown Catalog DSN {dsn}. Expected a string like 'postgresql://postgres@localhost:5432/postgres' or 'sqlite:///tmp/catalog.sqlite'"))] + UnknownCatalogDsn { dsn: String }, + + #[snafu(display("Catalog DSN not specified. Expected a string like 'postgresql://postgres@localhost:5432/postgres' or 'sqlite:///tmp/catalog.sqlite'"))] + DsnNotSpecified {}, + + #[snafu(display("Invalid URI: {source}"))] + InvalidUri { source: InvalidUri }, + + #[snafu(display("A catalog error occurred: {}", source))] + Catalog { + source: iox_catalog::interface::Error, + }, +} + +fn default_max_connections() -> &'static str { + let s = PostgresConnectionOptions::DEFAULT_MAX_CONNS.to_string(); + Box::leak(Box::new(s)) +} + +fn default_connect_timeout() -> &'static str { + let s = + humantime::format_duration(PostgresConnectionOptions::DEFAULT_CONNECT_TIMEOUT).to_string(); + Box::leak(Box::new(s)) +} + +fn default_idle_timeout() -> &'static str { + let s = humantime::format_duration(PostgresConnectionOptions::DEFAULT_IDLE_TIMEOUT).to_string(); + Box::leak(Box::new(s)) +} + +fn default_hotswap_poll_interval_timeout() -> &'static str { + let s = humantime::format_duration(PostgresConnectionOptions::DEFAULT_HOTSWAP_POLL_INTERVAL) + .to_string(); + Box::leak(Box::new(s)) +} + +/// CLI config for catalog DSN. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct CatalogDsnConfig { + /// Catalog connection string. + /// + /// The dsn determines the type of catalog used. 
+ /// + /// PostgreSQL: `postgresql://postgres@localhost:5432/postgres` + /// + /// Sqlite (a local filename /tmp/foo.sqlite): `sqlite:///tmp/foo.sqlite` - + /// note sqlite is for development/testing only and should not be used for + /// production workloads. + /// + /// Memory (ephemeral, only useful for testing): `memory` + /// + #[clap(long = "catalog-dsn", env = "INFLUXDB_IOX_CATALOG_DSN", action)] + pub dsn: Option, + + /// Maximum number of connections allowed to the catalog at any one time. + #[clap( + long = "catalog-max-connections", + env = "INFLUXDB_IOX_CATALOG_MAX_CONNECTIONS", + default_value = default_max_connections(), + action, + )] + pub max_catalog_connections: u32, + + /// Schema name for PostgreSQL-based catalogs. + #[clap( + long = "catalog-postgres-schema-name", + env = "INFLUXDB_IOX_CATALOG_POSTGRES_SCHEMA_NAME", + default_value = PostgresConnectionOptions::DEFAULT_SCHEMA_NAME, + action, + )] + pub postgres_schema_name: String, + + /// Set the amount of time to attempt connecting to the database. + #[clap( + long = "catalog-connect-timeout", + env = "INFLUXDB_IOX_CATALOG_CONNECT_TIMEOUT", + default_value = default_connect_timeout(), + value_parser = humantime::parse_duration, + )] + pub connect_timeout: Duration, + + /// Set a maximum idle duration for individual connections. + #[clap( + long = "catalog-idle-timeout", + env = "INFLUXDB_IOX_CATALOG_IDLE_TIMEOUT", + default_value = default_idle_timeout(), + value_parser = humantime::parse_duration, + )] + pub idle_timeout: Duration, + + /// If the DSN points to a file (i.e. starts with `dsn-file://`), this sets the interval how often the the file + /// should be polled for updates. + /// + /// If an update is encountered, the underlying connection pool will be hot-swapped. 
+ #[clap( + long = "catalog-hotswap-poll-interval", + env = "INFLUXDB_IOX_CATALOG_HOTSWAP_POLL_INTERVAL", + default_value = default_hotswap_poll_interval_timeout(), + value_parser = humantime::parse_duration, + )] + pub hotswap_poll_interval: Duration, +} + +impl CatalogDsnConfig { + /// Get config-dependent catalog. + pub async fn get_catalog( + &self, + app_name: &'static str, + metrics: Arc, + time_provider: Arc, + ) -> Result, Error> { + let Some(dsn) = self.dsn.as_ref() else { + return Err(Error::DsnNotSpecified {}); + }; + + if dsn.starts_with("postgres") || dsn.starts_with("dsn-file://") { + // do not log entire postgres dsn as it may contain credentials + info!(postgres_schema_name=%self.postgres_schema_name, "Catalog: Postgres"); + let options = PostgresConnectionOptions { + app_name: app_name.to_string(), + schema_name: self.postgres_schema_name.clone(), + dsn: dsn.clone(), + max_conns: self.max_catalog_connections, + connect_timeout: self.connect_timeout, + idle_timeout: self.idle_timeout, + hotswap_poll_interval: self.hotswap_poll_interval, + }; + Ok(Arc::new( + PostgresCatalog::connect(options, metrics) + .await + .context(CatalogSnafu)?, + )) + } else if dsn == "memory" { + info!("Catalog: In-memory"); + let mem = MemCatalog::new(metrics, time_provider); + Ok(Arc::new(mem)) + } else if let Some(file_path) = dsn.strip_prefix("sqlite://") { + info!(file_path, "Catalog: Sqlite"); + let options = SqliteConnectionOptions { + file_path: file_path.to_string(), + }; + Ok(Arc::new( + SqliteCatalog::connect(options, metrics) + .await + .context(CatalogSnafu)?, + )) + } else if dsn.starts_with("http://") || dsn.starts_with("https://") { + info!("Catalog: gRPC"); + let uri = dsn.parse().context(InvalidUriSnafu)?; + let grpc = GrpcCatalogClient::new(uri, metrics, time_provider); + Ok(Arc::new(grpc)) + } else { + Err(Error::UnknownCatalogDsn { + dsn: dsn.to_string(), + }) + } + } +} diff --git a/clap_blocks/src/compactor.rs b/clap_blocks/src/compactor.rs new file 
mode 100644 index 0000000..9b63bc8 --- /dev/null +++ b/clap_blocks/src/compactor.rs @@ -0,0 +1,156 @@ +//! CLI config for compactor-related commands + +use std::num::NonZeroUsize; + +use crate::{gossip::GossipConfig, memory_size::MemorySize}; + +use super::compactor_scheduler::CompactorSchedulerConfig; + +/// CLI config for compactor +#[derive(Debug, Clone, clap::Parser)] +pub struct CompactorConfig { + /// Gossip config. + #[clap(flatten)] + pub gossip_config: GossipConfig, + + /// Configuration for the compactor scheduler + #[clap(flatten)] + pub compactor_scheduler_config: CompactorSchedulerConfig, + + /// Number of partitions that should be compacted in parallel. + /// + /// This should usually be larger than the compaction job + /// concurrency since one partition can spawn multiple compaction + /// jobs. + #[clap( + long = "compaction-partition-concurrency", + env = "INFLUXDB_IOX_COMPACTION_PARTITION_CONCURRENCY", + default_value = "100", + action + )] + pub compaction_partition_concurrency: NonZeroUsize, + + /// Number of concurrent compaction jobs scheduled to DataFusion. + /// + /// This should usually be smaller than the partition concurrency + /// since one partition can spawn multiple DF compaction jobs. + #[clap( + long = "compaction-df-concurrency", + env = "INFLUXDB_IOX_COMPACTION_DF_CONCURRENCY", + default_value = "10", + action + )] + pub compaction_df_concurrency: NonZeroUsize, + + /// Number of jobs PER PARTITION that move files in and out of the + /// scratchpad. + #[clap( + long = "compaction-partition-scratchpad-concurrency", + env = "INFLUXDB_IOX_COMPACTION_PARTITION_SCRATCHPAD_CONCURRENCY", + default_value = "10", + action + )] + pub compaction_partition_scratchpad_concurrency: NonZeroUsize, + + /// Number of threads to use for the compactor query execution, + /// compaction and persistence. 
+ /// If not specified, defaults to one less than the number of cores on the system + #[clap( + long = "query-exec-thread-count", + env = "INFLUXDB_IOX_QUERY_EXEC_THREAD_COUNT", + action + )] + pub query_exec_thread_count: Option, + + /// Size of memory pool used during compaction plan execution, in + /// bytes. + /// + /// If compaction plans attempt to allocate more than this many + /// bytes during execution, they will error with + /// "ResourcesExhausted". + /// + /// Can be given as absolute value or in percentage of the total available memory (e.g. `10%`). + #[clap( + long = "exec-mem-pool-bytes", + env = "INFLUXDB_IOX_EXEC_MEM_POOL_BYTES", + default_value = "17179869184", // 16GB + action + )] + pub exec_mem_pool_bytes: MemorySize, + + /// Overrides INFLUXDB_IOX_EXEC_MEM_POOL_BYTES to set the size of memory pool + /// used during compaction DF plan execution. This value is expressed as a percent + /// of the memory limit for the cgroup (e.g. 70 = 70% of the cgroup memory limit). + /// This is converted to a byte limit as the compactor starts. + /// + /// Extreme values (<20% or >90%) are ignored and INFLUXDB_IOX_EXEC_MEM_POOL_BYTES + /// is used. It will also use INFLUXDB_IOX_EXEC_MEM_POOL_BYTES if we fail to read + /// the cgroup limit, or it doesn't parse to a sane value. + /// + /// If compaction plans attempt to allocate more than the computed byte limit + /// during execution, they will error with "ResourcesExhausted". + #[clap( + long = "exec-mem-pool-percent", + env = "INFLUXDB_IOX_EXEC_MEM_POOL_PERCENT", + default_value = "70", + action + )] + pub exec_mem_pool_percent: u64, + + /// Maximum duration of the per-partition compaction task in seconds. + #[clap( + long = "compaction-partition-timeout-secs", + env = "INFLUXDB_IOX_COMPACTION_PARTITION_TIMEOUT_SECS", + default_value = "1800", + action + )] + pub partition_timeout_secs: u64, + + /// Shadow mode. + /// + /// This will NOT write / commit any output to the object store or catalog. 
+ /// + /// This is mostly useful for debugging. + #[clap( + long = "compaction-shadow-mode", + env = "INFLUXDB_IOX_COMPACTION_SHADOW_MODE", + action + )] + pub shadow_mode: bool, + + /// Enable scratchpad. + /// + /// This allows disabling the scratchpad in production. + /// + /// Disabling this is useful for testing performance and memory consequences of the scratchpad. + #[clap( + long = "compaction-enable-scratchpad", + env = "INFLUXDB_IOX_COMPACTION_ENABLE_SCRATCHPAD", + default_value = "true", + action + )] + pub enable_scratchpad: bool, + + /// Only process all discovered partitions once. + /// + /// By default the compactor will continuously loop over all + /// partitions looking for work. Setting this option results in + /// exiting the loop after the one iteration. + #[clap( + long = "compaction-process-once", + env = "INFLUXDB_IOX_COMPACTION_PROCESS_ONCE", + action + )] + pub process_once: bool, + + /// Limit the number of partition fetch queries to at most the specified + /// number of queries per second. + /// + /// Queries are smoothed over the full second. + #[clap( + long = "max-partition-fetch-queries-per-second", + env = "INFLUXDB_IOX_MAX_PARTITION_FETCH_QUERIES_PER_SECOND", + action + )] + pub max_partition_fetch_queries_per_second: Option, +} diff --git a/clap_blocks/src/compactor_scheduler.rs b/clap_blocks/src/compactor_scheduler.rs new file mode 100644 index 0000000..e2b3c8f --- /dev/null +++ b/clap_blocks/src/compactor_scheduler.rs @@ -0,0 +1,351 @@ +//! Compactor-Scheduler-related configs. + +use crate::socket_addr::SocketAddr; +use std::str::FromStr; + +/// Compaction Scheduler type. +#[derive(Debug, Default, Clone, Copy, PartialEq, clap::ValueEnum)] +pub enum CompactorSchedulerType { + /// Perform scheduling decisions locally. + #[default] + Local, + + /// Perform scheduling decisions remotely. + Remote, +} + +/// CLI config for compactor scheduler. 
+#[derive(Debug, Clone, Default, clap::Parser)] +pub struct ShardConfigForLocalScheduler { + /// Number of shards. + /// + /// If this is set then the shard ID MUST also be set. If both are not provided, sharding is disabled. + /// (shard ID can be provided by the host name) + #[clap( + long = "compaction-shard-count", + env = "INFLUXDB_IOX_COMPACTION_SHARD_COUNT", + action + )] + pub shard_count: Option, + + /// Shard ID. + /// + /// Starts at 0, must be smaller than the number of shard. + /// + /// If this is set then the shard count MUST also be set. If both are not provided, sharding is disabled. + #[clap( + long = "compaction-shard-id", + env = "INFLUXDB_IOX_COMPACTION_SHARD_ID", + requires("shard_count"), + action + )] + pub shard_id: Option, + + /// Host Name + /// + /// comprised of leading text (e.g. 'iox-shared-compactor-'), ending with shard_id (e.g. '0'). + /// When shard_count is specified, but shard_id is not specified, the id is extracted from hostname. + #[clap(env = "HOSTNAME")] + pub hostname: Option, +} + +/// CLI config for partitions_source used by the scheduler. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct PartitionSourceConfigForLocalScheduler { + /// The compactor will only consider compacting partitions that + /// have new Parquet files created within this many minutes. + #[clap( + long = "compaction_partition_minute_threshold", + env = "INFLUXDB_IOX_COMPACTION_PARTITION_MINUTE_THRESHOLD", + default_value = "10", + action + )] + pub compaction_partition_minute_threshold: u64, + + /// Filter partitions to the given set of IDs. + /// + /// This is mostly useful for debugging. + #[clap( + long = "compaction-partition-filter", + env = "INFLUXDB_IOX_COMPACTION_PARTITION_FILTER", + action + )] + pub partition_filter: Option>, + + /// Compact all partitions found in the catalog, no matter if/when + /// they received writes. 
+ #[clap( + long = "compaction-process-all-partitions", + env = "INFLUXDB_IOX_COMPACTION_PROCESS_ALL_PARTITIONS", + default_value = "false", + action + )] + pub process_all_partitions: bool, + + /// Ignores "partition marked w/ error and shall be skipped" entries in the catalog. + /// + /// This is mostly useful for debugging. + #[clap( + long = "compaction-ignore-partition-skip-marker", + env = "INFLUXDB_IOX_COMPACTION_IGNORE_PARTITION_SKIP_MARKER", + action + )] + pub ignore_partition_skip_marker: bool, +} + +/// CLI config for scheduler's gossip. +#[derive(Debug, Clone, clap::Parser)] +pub struct CompactorSchedulerGossipConfig { + /// A comma-delimited set of seed gossip peer addresses. + /// + /// Example: "10.0.0.1:4242,10.0.0.2:4242" + /// + /// These seeds will be used to discover all other peers that talk to the + /// same seeds. Typically all nodes in the cluster should use the same set + /// of seeds. + #[clap( + long = "compactor-scheduler-gossip-seed-list", + env = "INFLUXDB_IOX_COMPACTOR_SCHEDULER_GOSSIP_SEED_LIST", + required = false, + num_args=1.., + value_delimiter = ',', + requires = "scheduler_gossip_bind_address", // Field name, not flag + )] + pub scheduler_seed_list: Vec, + + /// The UDP socket address IOx will use for gossip communication between + /// peers. + /// + /// Example: "0.0.0.0:4242" + /// + /// If not provided, the gossip sub-system is disabled. 
+ #[clap( + long = "compactor-scheduler-gossip-bind-address", + env = "INFLUXDB_IOX_COMPACTOR_SCHEDULER_GOSSIP_BIND_ADDR", + default_value = "0.0.0.0:0", + required = false, + action + )] + pub scheduler_gossip_bind_address: SocketAddr, +} + +impl Default for CompactorSchedulerGossipConfig { + fn default() -> Self { + Self { + scheduler_seed_list: vec![], + scheduler_gossip_bind_address: SocketAddr::from_str("0.0.0.0:4324").unwrap(), + } + } +} + +impl CompactorSchedulerGossipConfig { + /// constructor for GossipConfig + /// + pub fn new(bind_address: &str, seed_list: Vec) -> Self { + Self { + scheduler_seed_list: seed_list, + scheduler_gossip_bind_address: SocketAddr::from_str(bind_address).unwrap(), + } + } +} + +/// CLI config for compactor scheduler. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct CompactorSchedulerConfig { + /// Scheduler type to use. + #[clap( + value_enum, + long = "compactor-scheduler", + env = "INFLUXDB_IOX_COMPACTION_SCHEDULER", + default_value = "local", + action + )] + pub compactor_scheduler_type: CompactorSchedulerType, + + /// Maximum number of files that the compactor will try and + /// compact in a single plan. + /// + /// The higher this setting is the fewer compactor plans are run + /// and thus fewer resources over time are consumed by the + /// compactor. Increasing this setting also increases the peak + /// memory used for each compaction plan, and thus if it is set + /// too high, the compactor plans may exceed available memory. + #[clap( + long = "compaction-max-num-files-per-plan", + env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_FILES_PER_PLAN", + default_value = "20", + action + )] + pub max_num_files_per_plan: usize, + + /// Desired max size of compacted parquet files. + /// + /// Note this is a target desired value, rather than a guarantee. 
+ /// 1024 * 1024 * 100 = 104,857,600 + #[clap( + long = "compaction-max-desired-size-bytes", + env = "INFLUXDB_IOX_COMPACTION_MAX_DESIRED_FILE_SIZE_BYTES", + default_value = "104857600", + action + )] + pub max_desired_file_size_bytes: u64, + + /// Minimum number of L1 files to compact to L2. + /// + /// If there are more than this many L1 (by definition non + /// overlapping) files in a partition, the compactor will compact + /// them together into one or more larger L2 files. + /// + /// Setting this value higher in general results in fewer overall + /// resources spent on compaction but more files per partition (and + /// thus less optimal compression and query performance). + #[clap( + long = "compaction-min-num-l1-files-to-compact", + env = "INFLUXDB_IOX_COMPACTION_MIN_NUM_L1_FILES_TO_COMPACT", + default_value = "10", + action + )] + pub min_num_l1_files_to_compact: usize, + + /// Maximum number of columns in a table of a partition that + /// will be able to considered to get compacted + /// + /// If a table has more than this many columns, the compactor will + /// not compact it, to avoid large memory use. + #[clap( + long = "compaction-max-num-columns-per-table", + env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_COLUMNS_PER_TABLE", + default_value = "10000", + action + )] + pub max_num_columns_per_table: usize, + + /// Percentage of desired max file size for "leading edge split" + /// optimization. + /// + /// This setting controls the estimated output file size at which + /// the compactor will apply the "leading edge" optimization. 
+ /// + /// When compacting files together, if the output size is + /// estimated to be greater than the following quantity, the + /// "leading edge split" optimization will be applied: + /// + /// percentage_max_file_size * target_file_size + /// + /// This value must be between (0, 100) + /// + /// Default is 20 + #[clap( + long = "compaction-percentage-max-file_size", + env = "INFLUXDB_IOX_COMPACTION_PERCENTAGE_MAX_FILE_SIZE", + default_value = "20", + action + )] + pub percentage_max_file_size: u16, + + /// Enable new priority-based compaction selection. + /// + /// Eventually, this will be the only way to select partitions. + /// + /// Default is false + #[clap( + long = "compaction-priority-based-selection", + env = "INFLUXDB_IOX_COMPACTION_PRIORITY_BASED_SELECTION", + default_value = "false", + action + )] + pub priority_based_selection: bool, + + /// Split file percentage for "leading edge split" + /// + /// To reduce the likelihood of recompacting the same data too many + /// times, the compactor uses the "leading edge split" + /// optimization for the common case where the new data written + /// into a partition also has the most recent timestamps. + /// + /// When compacting multiple files together, if the compactor + /// estimates the resulting file will be large enough (see + /// `percentage_max_file_size`) it creates two output files + /// rather than one, split by time, like this: + /// + /// `|-------------- older_data -----------------||---- newer_data ----|` + /// + /// In the common case, the file containing `older_data` is less + /// likely to overlap with new data written in. + /// + /// This setting controls what percentage of data is placed into + /// the `older_data` portion. + /// + /// Increasing this value increases the average size of compacted + /// files after the first round of compaction. 
However, doing so + /// also increase the likelihood that late arriving data will + /// overlap with larger existing files, necessitating additional + /// compaction rounds. + /// + /// This value must be between (0, 100) + #[clap( + long = "compaction-split-percentage", + env = "INFLUXDB_IOX_COMPACTION_SPLIT_PERCENTAGE", + default_value = "80", + action + )] + pub split_percentage: u16, + + /// Partition source config used by the local scheduler. + #[clap(flatten)] + pub partition_source_config: PartitionSourceConfigForLocalScheduler, + + /// Shard config used by the local scheduler. + #[clap(flatten)] + pub shard_config: ShardConfigForLocalScheduler, + + /// Gossip config. + #[clap(flatten)] + pub gossip_config: CompactorSchedulerGossipConfig, +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + use test_helpers::assert_contains; + + #[test] + fn default_compactor_scheduler_type_is_local() { + let config = CompactorSchedulerConfig::try_parse_from(["my_binary"]).unwrap(); + assert_eq!( + config.compactor_scheduler_type, + CompactorSchedulerType::Local + ); + } + + #[test] + fn can_specify_local() { + let config = CompactorSchedulerConfig::try_parse_from([ + "my_binary", + "--compactor-scheduler", + "local", + ]) + .unwrap(); + assert_eq!( + config.compactor_scheduler_type, + CompactorSchedulerType::Local + ); + } + + #[test] + fn any_other_scheduler_type_string_is_invalid() { + let error = CompactorSchedulerConfig::try_parse_from([ + "my_binary", + "--compactor-scheduler", + "hello", + ]) + .unwrap_err() + .to_string(); + assert_contains!( + &error, + "invalid value 'hello' for '--compactor-scheduler '" + ); + assert_contains!(&error, "[possible values: local, remote]"); + } +} diff --git a/clap_blocks/src/garbage_collector.rs b/clap_blocks/src/garbage_collector.rs new file mode 100644 index 0000000..0b10d78 --- /dev/null +++ b/clap_blocks/src/garbage_collector.rs @@ -0,0 +1,150 @@ +//! 
Garbage Collector configuration +use clap::Parser; +use humantime::parse_duration; +use std::{fmt::Debug, time::Duration}; + +/// Configuration specific to the object store garbage collector +#[derive(Debug, Clone, Parser, Copy)] +pub struct GarbageCollectorConfig { + /// If this flag is specified, don't delete the files in object storage. Only print the files + /// that would be deleted if this flag wasn't specified. + #[clap(long, env = "INFLUXDB_IOX_GC_DRY_RUN")] + pub dry_run: bool, + + /// Items in the object store that are older than this duration that are not referenced in the + /// catalog will be deleted. + /// Parsed with + /// + /// If not specified, defaults to 14 days ago. + #[clap( + long, + default_value = "14d", + value_parser = parse_duration, + env = "INFLUXDB_IOX_GC_OBJECTSTORE_CUTOFF" + )] + pub objectstore_cutoff: Duration, + + /// Number of minutes to sleep between iterations of the objectstore list loop. + /// This is the sleep between entirely fresh list operations. + /// Defaults to 30 minutes. + #[clap( + long, + default_value_t = 30, + env = "INFLUXDB_IOX_GC_OBJECTSTORE_SLEEP_INTERVAL_MINUTES" + )] + pub objectstore_sleep_interval_minutes: u64, + + /// Number of milliseconds to sleep between listing consecutive chunks of objecstore files. + /// Object store listing is processed in batches; this is the sleep between batches. + /// Defaults to 1000 milliseconds. + #[clap( + long, + default_value_t = 1000, + env = "INFLUXDB_IOX_GC_OBJECTSTORE_SLEEP_INTERVAL_BATCH_MILLISECONDS" + )] + pub objectstore_sleep_interval_batch_milliseconds: u64, + + /// Parquet file rows in the catalog flagged for deletion before this duration will be deleted. + /// Parsed with + /// + /// If not specified, defaults to 14 days ago. 
+ #[clap( + long, + default_value = "14d", + value_parser = parse_duration, + env = "INFLUXDB_IOX_GC_PARQUETFILE_CUTOFF" + )] + pub parquetfile_cutoff: Duration, + + /// Number of minutes to sleep between iterations of the parquet file deletion loop. + /// + /// Defaults to 30 minutes. + /// + /// If both INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES and + /// INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL are specified, the smaller is chosen + #[clap(long, env = "INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES")] + pub parquetfile_sleep_interval_minutes: Option, + + /// Duration to sleep between iterations of the parquet file deletion loop. + /// + /// Defaults to 30 minutes. + /// + /// If both INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES and + /// INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL are specified, the smaller is chosen + #[clap( + long, + value_parser = parse_duration, + env = "INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL" + )] + pub parquetfile_sleep_interval: Option, + + /// Number of minutes to sleep between iterations of the retention code. + /// Defaults to 35 minutes to reduce incidence of it running at the same time as the parquet + /// file deleter. 
+ #[clap( + long, + default_value_t = 35, + env = "INFLUXDB_IOX_GC_RETENTION_SLEEP_INTERVAL_MINUTES" + )] + pub retention_sleep_interval_minutes: u64, +} + +impl GarbageCollectorConfig { + /// Returns the parquet_file sleep interval + pub fn parquetfile_sleep_interval(&self) -> Duration { + match ( + self.parquetfile_sleep_interval, + self.parquetfile_sleep_interval_minutes, + ) { + (None, None) => Duration::from_secs(30 * 60), + (Some(d), None) => d, + (None, Some(m)) => Duration::from_secs(m * 60), + (Some(d), Some(m)) => d.min(Duration::from_secs(m * 60)), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_gc_config() { + let a: &[&str] = &[]; + let config = GarbageCollectorConfig::parse_from(a); + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(30 * 60) + ); + + let config = + GarbageCollectorConfig::parse_from(["something", "--parquetfile-sleep-interval", "3d"]); + + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(24 * 60 * 60 * 3) + ); + + let config = GarbageCollectorConfig::parse_from([ + "something", + "--parquetfile-sleep-interval-minutes", + "34", + ]); + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(34 * 60) + ); + + let config = GarbageCollectorConfig::parse_from([ + "something", + "--parquetfile-sleep-interval-minutes", + "34", + "--parquetfile-sleep-interval", + "35m", + ]); + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(34 * 60) + ); + } +} diff --git a/clap_blocks/src/gossip.rs b/clap_blocks/src/gossip.rs new file mode 100644 index 0000000..47365ba --- /dev/null +++ b/clap_blocks/src/gossip.rs @@ -0,0 +1,52 @@ +//! CLI config for cluster gossip communication. + +use crate::socket_addr::SocketAddr; +use std::str::FromStr; + +/// Configuration parameters for the cluster gossip communication mechanism. 
+#[derive(Debug, Clone, clap::Parser)] +#[allow(missing_copy_implementations)] +pub struct GossipConfig { + /// A comma-delimited set of seed gossip peer addresses. + /// + /// Example: "10.0.0.1:4242,10.0.0.2:4242" + /// + /// These seeds will be used to discover all other peers that talk to the + /// same seeds. Typically all nodes in the cluster should use the same set + /// of seeds. + #[clap( + long = "gossip-seed-list", + env = "INFLUXDB_IOX_GOSSIP_SEED_LIST", + required = false, + num_args=1.., + value_delimiter = ',', + requires = "gossip_bind_address", // Field name, not flag + )] + pub seed_list: Vec, + + /// The UDP socket address IOx will use for gossip communication between + /// peers. + /// + /// Example: "0.0.0.0:4242" + /// + /// If not provided, the gossip sub-system is disabled. + #[clap( + long = "gossip-bind-address", + env = "INFLUXDB_IOX_GOSSIP_BIND_ADDR", + default_value = "0.0.0.0:4242", + required = false, + action + )] + pub gossip_bind_address: SocketAddr, +} + +impl GossipConfig { + /// constructor for GossipConfig + /// + pub fn new(bind_address: &str, seed_list: Vec) -> Self { + Self { + seed_list, + gossip_bind_address: SocketAddr::from_str(bind_address).unwrap(), + } + } +} diff --git a/clap_blocks/src/ingester.rs b/clap_blocks/src/ingester.rs new file mode 100644 index 0000000..be2ab26 --- /dev/null +++ b/clap_blocks/src/ingester.rs @@ -0,0 +1,101 @@ +//! CLI config for the ingester using the RPC write path + +use std::{num::NonZeroUsize, path::PathBuf}; + +use crate::gossip::GossipConfig; + +/// CLI config for the ingester using the RPC write path +#[derive(Debug, Clone, clap::Parser)] +#[allow(missing_copy_implementations)] +pub struct IngesterConfig { + /// Gossip config. + #[clap(flatten)] + pub gossip_config: GossipConfig, + + /// Where this ingester instance should store its write-ahead log files. Each ingester instance + /// must have its own directory. 
+ #[clap(long = "wal-directory", env = "INFLUXDB_IOX_WAL_DIRECTORY", action)] + pub wal_directory: PathBuf, + + /// Specify the maximum allowed incoming RPC write message size sent by the + /// Router. + #[clap( + long = "rpc-write-max-incoming-bytes", + env = "INFLUXDB_IOX_RPC_WRITE_MAX_INCOMING_BYTES", + default_value = "104857600", // 100MiB + )] + pub rpc_write_max_incoming_bytes: usize, + + /// The number of seconds between WAL file rotations. + #[clap( + long = "wal-rotation-period-seconds", + env = "INFLUXDB_IOX_WAL_ROTATION_PERIOD_SECONDS", + default_value = "300", + action + )] + pub wal_rotation_period_seconds: u64, + + /// Sets how many queries the ingester will handle simultaneously before + /// rejecting further incoming requests. + #[clap( + long = "concurrent-query-limit", + env = "INFLUXDB_IOX_CONCURRENT_QUERY_LIMIT", + default_value = "20", + action + )] + pub concurrent_query_limit: usize, + + /// The maximum number of persist tasks that can run simultaneously. + #[clap( + long = "persist-max-parallelism", + env = "INFLUXDB_IOX_PERSIST_MAX_PARALLELISM", + default_value = "5", + action + )] + pub persist_max_parallelism: usize, + + /// The maximum number of persist tasks that can be queued at any one time. + /// + /// Once this limit is reached, ingest is blocked until the persist backlog + /// is reduced. + #[clap( + long = "persist-queue-depth", + env = "INFLUXDB_IOX_PERSIST_QUEUE_DEPTH", + default_value = "250", + action + )] + pub persist_queue_depth: usize, + + /// The limit at which a partition's estimated persistence cost causes it to + /// be queued for persistence. + #[clap( + long = "persist-hot-partition-cost", + env = "INFLUXDB_IOX_PERSIST_HOT_PARTITION_COST", + default_value = "20000000", // 20,000,000 + action + )] + pub persist_hot_partition_cost: usize, + + /// An optional lower bound byte size limit that buffered data within a + /// partition must reach in order to be converted into an incremental + /// snapshot at query time. 
+ /// + /// Snapshots improve query performance by amortising response generation at + /// the expense of a small memory overhead. Snapshots are retained until the + /// buffer is persisted. + #[clap( + long = "min-partition-snapshot-size", + env = "INFLUXDB_IOX_MIN_PARTITION_SNAPSHOT_SIZE" + )] + pub min_partition_snapshot_size: Option, + + /// Limit the number of partitions that may be buffered in a single + /// namespace (across all tables) at any one time. + /// + /// This limit is disabled by default. + #[clap( + long = "max-partitions-per-namespace", + env = "INFLUXDB_IOX_MAX_PARTITIONS_PER_NAMESPACE" + )] + pub max_partitions_per_namespace: Option, +} diff --git a/clap_blocks/src/ingester_address.rs b/clap_blocks/src/ingester_address.rs new file mode 100644 index 0000000..90a8e8d --- /dev/null +++ b/clap_blocks/src/ingester_address.rs @@ -0,0 +1,308 @@ +//! Shared configuration and tests for accepting ingester addresses as arguments. + +use http::uri::{InvalidUri, InvalidUriParts, Uri}; +use snafu::{ResultExt, Snafu}; +use std::{fmt::Display, str::FromStr}; + +/// An address to an ingester's gRPC API. Create by using `IngesterAddress::from_str`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct IngesterAddress { + uri: Uri, +} + +/// Why a specified ingester address might be invalid +#[allow(missing_docs)] +#[derive(Snafu, Debug)] +pub enum Error { + #[snafu(display("{source}"))] + Invalid { source: InvalidUri }, + + #[snafu(display("Port is required; no port found in `{value}`"))] + MissingPort { value: String }, + + #[snafu(context(false))] + InvalidParts { source: InvalidUriParts }, +} + +impl FromStr for IngesterAddress { + type Err = Error; + + fn from_str(s: &str) -> Result { + let uri = Uri::from_str(s).context(InvalidSnafu)?; + + if uri.port().is_none() { + return MissingPortSnafu { value: s }.fail(); + } + + let uri = if uri.scheme().is_none() { + Uri::from_str(&format!("http://{s}")).context(InvalidSnafu)? 
+ } else { + uri + }; + + Ok(Self { uri }) + } +} + +impl Display for IngesterAddress { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.uri) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::{error::ErrorKind, Parser}; + use std::env; + use test_helpers::{assert_contains, assert_error}; + + /// Applications such as the router MUST have valid ingester addresses. + #[derive(Debug, Clone, clap::Parser)] + struct RouterConfig { + #[clap( + long = "ingester-addresses", + env = "TEST_INFLUXDB_IOX_INGESTER_ADDRESSES", + required = true, + num_args=1.., + value_delimiter = ',' + )] + pub(crate) ingester_addresses: Vec, + } + + #[test] + fn error_if_not_specified_when_required() { + assert_error!( + RouterConfig::try_parse_from(["my_binary"]), + ref e if e.kind() == ErrorKind::MissingRequiredArgument + ); + } + + /// Applications such as the querier might not have any ingester addresses, but if they have + /// any, they should be valid. 
+ #[derive(Debug, Clone, clap::Parser)] + struct QuerierConfig { + #[clap( + long = "ingester-addresses", + env = "TEST_INFLUXDB_IOX_INGESTER_ADDRESSES", + required = false, + num_args=0.., + value_delimiter = ',' + )] + pub(crate) ingester_addresses: Vec, + } + + #[test] + fn empty_if_not_specified_when_optional() { + assert!(QuerierConfig::try_parse_from(["my_binary"]) + .unwrap() + .ingester_addresses + .is_empty()); + } + + fn both_types_valid(args: &[&'static str], expected: &[&'static str]) { + let router = RouterConfig::try_parse_from(args).unwrap(); + let actual: Vec<_> = router + .ingester_addresses + .iter() + .map(ToString::to_string) + .collect(); + assert_eq!(actual, expected); + + let querier = QuerierConfig::try_parse_from(args).unwrap(); + let actual: Vec<_> = querier + .ingester_addresses + .iter() + .map(ToString::to_string) + .collect(); + assert_eq!(actual, expected); + } + + fn both_types_error(args: &[&'static str], expected_error_message: &'static str) { + assert_contains!( + RouterConfig::try_parse_from(args).unwrap_err().to_string(), + expected_error_message + ); + assert_contains!( + QuerierConfig::try_parse_from(args).unwrap_err().to_string(), + expected_error_message + ); + } + + #[test] + fn accepts_one() { + let args = [ + "my_binary", + "--ingester-addresses", + "http://example.com:1234", + ]; + let expected = ["http://example.com:1234/"]; + + both_types_valid(&args, &expected); + } + + #[test] + fn accepts_two() { + let args = [ + "my_binary", + "--ingester-addresses", + "http://example.com:1234,http://example.com:5678", + ]; + let expected = ["http://example.com:1234/", "http://example.com:5678/"]; + + both_types_valid(&args, &expected); + } + + #[test] + fn rejects_any_invalid_uri() { + let args = [ + "my_binary", + "--ingester-addresses", + "http://example.com:1234,", // note the trailing comma; empty URIs are invalid + ]; + let expected = "error: invalid value '' for '--ingester-addresses"; + + both_types_error(&args, expected); 
+ } + + #[test] + fn rejects_uri_without_port() { + let args = [ + "my_binary", + "--ingester-addresses", + "example.com,http://example.com:1234", + ]; + let expected = "Port is required; no port found in `example.com`"; + + both_types_error(&args, expected); + } + + #[test] + fn no_scheme_assumes_http() { + let args = [ + "my_binary", + "--ingester-addresses", + "http://example.com:1234,somescheme://0.0.0.0:1000,127.0.0.1:8080", + ]; + let expected = [ + "http://example.com:1234/", + "somescheme://0.0.0.0:1000/", + "http://127.0.0.1:8080/", + ]; + + both_types_valid(&args, &expected); + } + + #[test] + fn specifying_flag_multiple_times_works() { + let args = [ + "my_binary", + "--ingester-addresses", + "http://example.com:1234", + "--ingester-addresses", + "somescheme://0.0.0.0:1000", + "--ingester-addresses", + "127.0.0.1:8080", + ]; + let expected = [ + "http://example.com:1234/", + "somescheme://0.0.0.0:1000/", + "http://127.0.0.1:8080/", + ]; + + both_types_valid(&args, &expected); + } + + #[test] + fn specifying_flag_multiple_times_and_using_commas_works() { + let args = [ + "my_binary", + "--ingester-addresses", + "http://example.com:1234", + "--ingester-addresses", + "somescheme://0.0.0.0:1000,127.0.0.1:8080", + ]; + let expected = [ + "http://example.com:1234/", + "somescheme://0.0.0.0:1000/", + "http://127.0.0.1:8080/", + ]; + + both_types_valid(&args, &expected); + } + + /// Use an environment variable name not shared with any other config to avoid conflicts when + /// setting the var in tests. + /// Applications such as the router MUST have valid ingester addresses. 
+ #[derive(Debug, Clone, clap::Parser)] + struct EnvRouterConfig { + #[clap( + long = "ingester-addresses", + env = "NO_CONFLICT_ROUTER_TEST_INFLUXDB_IOX_INGESTER_ADDRESSES", + required = true, + num_args=1.., + value_delimiter = ',' + )] + pub(crate) ingester_addresses: Vec, + } + + #[test] + fn required_and_specified_via_environment_variable() { + env::set_var( + "NO_CONFLICT_ROUTER_TEST_INFLUXDB_IOX_INGESTER_ADDRESSES", + "http://example.com:1234,somescheme://0.0.0.0:1000,127.0.0.1:8080", + ); + let args = ["my_binary"]; + let expected = [ + "http://example.com:1234/", + "somescheme://0.0.0.0:1000/", + "http://127.0.0.1:8080/", + ]; + + let router = EnvRouterConfig::try_parse_from(args).unwrap(); + let actual: Vec<_> = router + .ingester_addresses + .iter() + .map(ToString::to_string) + .collect(); + assert_eq!(actual, expected); + } + + /// Use an environment variable name not shared with any other config to avoid conflicts when + /// setting the var in tests. + /// Applications such as the querier might not have any ingester addresses, but if they have + /// any, they should be valid. 
+ #[derive(Debug, Clone, clap::Parser)] + struct EnvQuerierConfig { + #[clap( + long = "ingester-addresses", + env = "NO_CONFLICT_QUERIER_TEST_INFLUXDB_IOX_INGESTER_ADDRESSES", + required = false, + num_args=0.., + value_delimiter = ',' + )] + pub(crate) ingester_addresses: Vec, + } + + #[test] + fn optional_and_specified_via_environment_variable() { + env::set_var( + "NO_CONFLICT_QUERIER_TEST_INFLUXDB_IOX_INGESTER_ADDRESSES", + "http://example.com:1234,somescheme://0.0.0.0:1000,127.0.0.1:8080", + ); + let args = ["my_binary"]; + let expected = [ + "http://example.com:1234/", + "somescheme://0.0.0.0:1000/", + "http://127.0.0.1:8080/", + ]; + + let querier = EnvQuerierConfig::try_parse_from(args).unwrap(); + let actual: Vec<_> = querier + .ingester_addresses + .iter() + .map(ToString::to_string) + .collect(); + assert_eq!(actual, expected); + } +} diff --git a/clap_blocks/src/lib.rs b/clap_blocks/src/lib.rs new file mode 100644 index 0000000..d9f6891 --- /dev/null +++ b/clap_blocks/src/lib.rs @@ -0,0 +1,37 @@ +//! Building blocks for [`clap`]-driven configs. +//! +//! They can easily be re-used using `#[clap(flatten)]`. +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives. 
+use workspace_hack as _; + +pub mod bulk_ingest; +pub mod catalog_cache; +pub mod catalog_dsn; +pub mod compactor; +pub mod compactor_scheduler; +pub mod garbage_collector; +pub mod gossip; +pub mod ingester; +pub mod ingester_address; +pub mod memory_size; +pub mod object_store; +pub mod parquet_cache; +pub mod querier; +pub mod router; +pub mod run_config; +pub mod single_tenant; +pub mod socket_addr; diff --git a/clap_blocks/src/memory_size.rs b/clap_blocks/src/memory_size.rs new file mode 100644 index 0000000..6e7515d --- /dev/null +++ b/clap_blocks/src/memory_size.rs @@ -0,0 +1,113 @@ +//! Helper types to express memory size. + +use std::{str::FromStr, sync::OnceLock}; + +use sysinfo::{MemoryRefreshKind, RefreshKind, System}; + +/// Memory size. +/// +/// # Parsing +/// This can be parsed from strings in one of the following formats: +/// +/// - **absolute:** just use a non-negative number to specify the absolute bytes, e.g. `1024` +/// - **relative:** use percentage between 0 and 100 (both inclusive) to specify a relative amount of the totally +/// available memory size, e.g. `50%` +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct MemorySize(usize); + +impl MemorySize { + /// Number of bytes. 
+ pub fn bytes(&self) -> usize { + self.0 + } +} + +impl std::fmt::Debug for MemorySize { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::fmt::Display for MemorySize { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl FromStr for MemorySize { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.strip_suffix('%') { + Some(s) => { + let percentage = u64::from_str(s).map_err(|e| e.to_string())?; + if percentage > 100 { + return Err(format!( + "relative memory size must be in [0, 100] but is {percentage}" + )); + } + let total = total_mem_bytes(); + let bytes = (percentage as f64 / 100f64 * total as f64).round() as usize; + Ok(Self(bytes)) + } + None => { + let bytes = usize::from_str(s).map_err(|e| e.to_string())?; + Ok(Self(bytes)) + } + } + } +} + +/// Totally available memory size in bytes. +pub fn total_mem_bytes() -> usize { + // Keep this in a global state so that we only need to inspect the system once during IOx startup. 
+ static TOTAL_MEM_BYTES: OnceLock = OnceLock::new(); + + *TOTAL_MEM_BYTES.get_or_init(|| { + let sys = System::new_with_specifics( + RefreshKind::new().with_memory(MemoryRefreshKind::everything()), + ); + sys.total_memory() as usize + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse() { + assert_ok("0", 0); + assert_ok("1", 1); + assert_ok("1024", 1024); + assert_ok("0%", 0); + + assert_gt_zero("50%"); + + assert_err("-1", "invalid digit found in string"); + assert_err("foo", "invalid digit found in string"); + assert_err("-1%", "invalid digit found in string"); + assert_err( + "101%", + "relative memory size must be in [0, 100] but is 101", + ); + } + + #[track_caller] + fn assert_ok(s: &'static str, expected: usize) { + let parsed: MemorySize = s.parse().unwrap(); + assert_eq!(parsed.bytes(), expected); + } + + #[track_caller] + fn assert_gt_zero(s: &'static str) { + let parsed: MemorySize = s.parse().unwrap(); + assert!(parsed.bytes() > 0); + } + + #[track_caller] + fn assert_err(s: &'static str, expected: &'static str) { + let err = MemorySize::from_str(s).unwrap_err(); + assert_eq!(err, expected); + } +} diff --git a/clap_blocks/src/object_store.rs b/clap_blocks/src/object_store.rs new file mode 100644 index 0000000..e961357 --- /dev/null +++ b/clap_blocks/src/object_store.rs @@ -0,0 +1,775 @@ +//! CLI handling for object store config (via CLI arguments and environment variables). 
+ +use futures::TryStreamExt; +use non_empty_string::NonEmptyString; +use object_store::{ + memory::InMemory, + path::Path, + throttle::{ThrottleConfig, ThrottledStore}, + DynObjectStore, +}; +use observability_deps::tracing::{info, warn}; +use snafu::{ResultExt, Snafu}; +use std::{convert::Infallible, fs, num::NonZeroUsize, path::PathBuf, sync::Arc, time::Duration}; +use uuid::Uuid; + +use crate::parquet_cache::ParquetCacheClientConfig; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum ParseError { + #[snafu(display("Unable to create database directory {:?}: {}", path, source))] + CreatingDatabaseDirectory { + path: PathBuf, + source: std::io::Error, + }, + + #[snafu(display("Unable to create local store {:?}: {}", path, source))] + CreateLocalFileSystem { + path: PathBuf, + source: object_store::Error, + }, + + #[snafu(display( + "Specified {:?} for the object store, required configuration missing for {}", + object_store, + missing + ))] + MissingObjectStoreConfig { + object_store: ObjectStoreType, + missing: String, + }, + + // Creating a new S3 object store can fail if the region is *specified* but + // not *parseable* as a rusoto `Region`. The other object store constructors + // don't return `Result`. + #[snafu(display("Error configuring Amazon S3: {}", source))] + InvalidS3Config { source: object_store::Error }, + + #[snafu(display("Error configuring GCS: {}", source))] + InvalidGCSConfig { source: object_store::Error }, + + #[snafu(display("Error configuring Microsoft Azure: {}", source))] + InvalidAzureConfig { source: object_store::Error }, +} + +/// The AWS region to use for Amazon S3 based object storage if none is +/// specified. +pub const FALLBACK_AWS_REGION: &str = "us-east-1"; + +/// A `clap` `value_parser` which returns `None` when given an empty string and +/// `Some(NonEmptyString)` otherwise. 
+fn parse_optional_string(s: &str) -> Result, Infallible> { + Ok(NonEmptyString::new(s.to_string()).ok()) +} + +/// CLI config for object stores. +#[derive(Debug, Clone, clap::Parser)] +pub struct ObjectStoreConfig { + /// Which object storage to use. If not specified, defaults to memory. + /// + /// Possible values (case insensitive): + /// + /// * memory (default): Effectively no object persistence. + /// * memorythrottled: Like `memory` but with latency and throughput that somewhat resamble a cloud + /// object store. Useful for testing and benchmarking. + /// * file: Stores objects in the local filesystem. Must also set `--data-dir`. + /// * s3: Amazon S3. Must also set `--bucket`, `--aws-access-key-id`, `--aws-secret-access-key`, and + /// possibly `--aws-default-region`. + /// * google: Google Cloud Storage. Must also set `--bucket` and `--google-service-account`. + /// * azure: Microsoft Azure blob storage. Must also set `--bucket`, `--azure-storage-account`, + /// and `--azure-storage-access-key`. + #[clap( + value_enum, + long = "object-store", + env = "INFLUXDB_IOX_OBJECT_STORE", + ignore_case = true, + action, + verbatim_doc_comment + )] + pub object_store: Option, + + /// Name of the bucket to use for the object store. Must also set + /// `--object-store` to a cloud object storage to have any effect. + /// + /// If using Google Cloud Storage for the object store, this item as well + /// as `--google-service-account` must be set. + /// + /// If using S3 for the object store, must set this item as well + /// as `--aws-access-key-id` and `--aws-secret-access-key`. Can also set + /// `--aws-default-region` if not using the fallback region. + /// + /// If using Azure for the object store, set this item to the name of a + /// container you've created in the associated storage account, under + /// Blob Service > Containers. Must also set `--azure-storage-account` and + /// `--azure-storage-access-key`. 
+ #[clap(long = "bucket", env = "INFLUXDB_IOX_BUCKET", action)] + pub bucket: Option, + + /// The location InfluxDB IOx will use to store files locally. + #[clap(long = "data-dir", env = "INFLUXDB_IOX_DB_DIR", action)] + pub database_directory: Option, + + /// When using Amazon S3 as the object store, set this to an access key that + /// has permission to read from and write to the specified S3 bucket. + /// + /// Must also set `--object-store=s3`, `--bucket`, and + /// `--aws-secret-access-key`. Can also set `--aws-default-region` if not + /// using the fallback region. + /// + /// Prefer the environment variable over the command line flag in shared + /// environments. + /// + /// An empty string value is equivalent to omitting the flag. + /// Note: must refer to std::option::Option explicitly, see + #[clap(long = "aws-access-key-id", env = "AWS_ACCESS_KEY_ID", value_parser = parse_optional_string, default_value="", action)] + pub aws_access_key_id: std::option::Option, + + /// When using Amazon S3 as the object store, set this to the secret access + /// key that goes with the specified access key ID. + /// + /// Must also set `--object-store=s3`, `--bucket`, `--aws-access-key-id`. + /// Can also set `--aws-default-region` if not using the fallback region. + /// + /// Prefer the environment variable over the command line flag in shared + /// environments. + /// + /// An empty string value is equivalent to omitting the flag. + /// Note: must refer to std::option::Option explicitly, see + #[clap(long = "aws-secret-access-key", env = "AWS_SECRET_ACCESS_KEY", value_parser = parse_optional_string, default_value = "", action)] + pub aws_secret_access_key: std::option::Option, + + /// When using Amazon S3 as the object store, set this to the region + /// that goes with the specified bucket if different from the fallback + /// value. + /// + /// Must also set `--object-store=s3`, `--bucket`, `--aws-access-key-id`, + /// and `--aws-secret-access-key`. 
+ #[clap( + long = "aws-default-region", + env = "AWS_DEFAULT_REGION", + default_value = FALLBACK_AWS_REGION, + action, + )] + pub aws_default_region: String, + + /// When using Amazon S3 compatibility storage service, set this to the + /// endpoint. + /// + /// Must also set `--object-store=s3`, `--bucket`. Can also set `--aws-default-region` + /// if not using the fallback region. + /// + /// Prefer the environment variable over the command line flag in shared + /// environments. + #[clap(long = "aws-endpoint", env = "AWS_ENDPOINT", action)] + pub aws_endpoint: Option, + + /// When using Amazon S3 as an object store, set this to the session token. This is handy when using a federated + /// login / SSO and you fetch credentials via the UI. + /// + /// Is it assumed that the session is valid as long as the IOx server is running. + /// + /// Prefer the environment variable over the command line flag in shared + /// environments. + #[clap(long = "aws-session-token", env = "AWS_SESSION_TOKEN", action)] + pub aws_session_token: Option, + + /// Allow unencrypted HTTP connection to AWS. + #[clap(long = "aws-allow-http", env = "AWS_ALLOW_HTTP", action)] + pub aws_allow_http: bool, + + /// When using Google Cloud Storage as the object store, set this to the + /// path to the JSON file that contains the Google credentials. + /// + /// Must also set `--object-store=google` and `--bucket`. + #[clap( + long = "google-service-account", + env = "GOOGLE_SERVICE_ACCOUNT", + action + )] + pub google_service_account: Option, + + /// When using Microsoft Azure as the object store, set this to the + /// name you see when going to All Services > Storage accounts > `[name]`. + /// + /// Must also set `--object-store=azure`, `--bucket`, and + /// `--azure-storage-access-key`. 
+ #[clap(long = "azure-storage-account", env = "AZURE_STORAGE_ACCOUNT", action)] + pub azure_storage_account: Option, + + /// When using Microsoft Azure as the object store, set this to one of the + /// Key values in the Storage account's Settings > Access keys. + /// + /// Must also set `--object-store=azure`, `--bucket`, and + /// `--azure-storage-account`. + /// + /// Prefer the environment variable over the command line flag in shared + /// environments. + #[clap( + long = "azure-storage-access-key", + env = "AZURE_STORAGE_ACCESS_KEY", + action + )] + pub azure_storage_access_key: Option, + + /// When using a network-based object store, limit the number of connection to this value. + #[clap( + long = "object-store-connection-limit", + env = "OBJECT_STORE_CONNECTION_LIMIT", + default_value = "16", + action + )] + pub object_store_connection_limit: NonZeroUsize, + + /// Optional config for the cache client. + #[clap(flatten)] + pub cache_config: Option, +} + +impl ObjectStoreConfig { + /// Create a new instance for all-in-one mode, only allowing some arguments. + pub fn new(database_directory: Option) -> Self { + match &database_directory { + Some(dir) => info!("Object store: File-based in `{}`", dir.display()), + None => info!("Object store: In-memory"), + } + + let object_store = database_directory.as_ref().map(|_| ObjectStoreType::File); + + Self { + aws_access_key_id: Default::default(), + aws_allow_http: Default::default(), + aws_default_region: Default::default(), + aws_endpoint: Default::default(), + aws_secret_access_key: Default::default(), + aws_session_token: Default::default(), + azure_storage_access_key: Default::default(), + azure_storage_account: Default::default(), + bucket: Default::default(), + database_directory, + google_service_account: Default::default(), + object_store, + object_store_connection_limit: NonZeroUsize::new(16).unwrap(), + cache_config: Default::default(), + } + } +} + +/// Object-store type. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq, clap::ValueEnum)] +pub enum ObjectStoreType { + /// In-memory. + Memory, + + /// In-memory with additional throttling applied for testing + MemoryThrottled, + + /// Filesystem. + File, + + /// AWS S3. + S3, + + /// GCS. + Google, + + /// Azure object store. + Azure, +} + +#[cfg(feature = "gcp")] +fn new_gcs(config: &ObjectStoreConfig) -> Result, ParseError> { + use object_store::gcp::GoogleCloudStorageBuilder; + use object_store::limit::LimitStore; + + info!(bucket=?config.bucket, object_store_type="GCS", "Object Store"); + + let mut builder = GoogleCloudStorageBuilder::new(); + + if let Some(bucket) = &config.bucket { + builder = builder.with_bucket_name(bucket); + } + if let Some(account) = &config.google_service_account { + builder = builder.with_service_account_path(account); + } + + Ok(Arc::new(LimitStore::new( + builder.build().context(InvalidGCSConfigSnafu)?, + config.object_store_connection_limit.get(), + ))) +} + +#[cfg(not(feature = "gcp"))] +fn new_gcs(_: &ObjectStoreConfig) -> Result, ParseError> { + panic!("GCS support not enabled, recompile with the gcp feature enabled") +} + +#[cfg(feature = "aws")] +fn new_s3(config: &ObjectStoreConfig) -> Result, ParseError> { + use object_store::limit::LimitStore; + + info!( + bucket=?config.bucket, + endpoint=?config.aws_endpoint, + object_store_type="S3", + "Object Store" + ); + + Ok(Arc::new(LimitStore::new( + build_s3(config)?, + config.object_store_connection_limit.get(), + ))) +} + +#[cfg(feature = "aws")] +fn build_s3(config: &ObjectStoreConfig) -> Result { + use object_store::aws::AmazonS3Builder; + + let mut builder = AmazonS3Builder::new() + .with_allow_http(config.aws_allow_http) + .with_region(&config.aws_default_region) + .with_imdsv1_fallback(); + + if let Some(bucket) = &config.bucket { + builder = builder.with_bucket_name(bucket); + } + if let Some(key_id) = &config.aws_access_key_id { + builder = builder.with_access_key_id(key_id.get()); + } + if let 
Some(token) = &config.aws_session_token { + builder = builder.with_token(token); + } + if let Some(secret) = &config.aws_secret_access_key { + builder = builder.with_secret_access_key(secret.get()); + } + if let Some(endpoint) = &config.aws_endpoint { + builder = builder.with_endpoint(endpoint); + } + + builder.build().context(InvalidS3ConfigSnafu) +} + +#[cfg(not(feature = "aws"))] +fn new_s3(_: &ObjectStoreConfig) -> Result, ParseError> { + panic!("S3 support not enabled, recompile with the aws feature enabled") +} + +#[cfg(feature = "azure")] +fn new_azure(config: &ObjectStoreConfig) -> Result, ParseError> { + use object_store::azure::MicrosoftAzureBuilder; + use object_store::limit::LimitStore; + + info!(bucket=?config.bucket, account=?config.azure_storage_account, + object_store_type="Azure", "Object Store"); + + let mut builder = MicrosoftAzureBuilder::new(); + + if let Some(bucket) = &config.bucket { + builder = builder.with_container_name(bucket); + } + if let Some(account) = &config.azure_storage_account { + builder = builder.with_account(account) + } + if let Some(key) = &config.azure_storage_access_key { + builder = builder.with_access_key(key) + } + + Ok(Arc::new(LimitStore::new( + builder.build().context(InvalidAzureConfigSnafu)?, + config.object_store_connection_limit.get(), + ))) +} + +#[cfg(not(feature = "azure"))] +fn new_azure(_: &ObjectStoreConfig) -> Result, ParseError> { + panic!("Azure blob storage support not enabled, recompile with the azure feature enabled") +} + +/// Create config-dependant object store. +pub fn make_object_store(config: &ObjectStoreConfig) -> Result, ParseError> { + if let Some(data_dir) = &config.database_directory { + if !matches!(&config.object_store, Some(ObjectStoreType::File)) { + warn!(?data_dir, object_store_type=?config.object_store, + "--data-dir / `INFLUXDB_IOX_DB_DIR` ignored. 
It only affects 'file' object stores"); + } + } + + let remote_store: Arc = match &config.object_store { + Some(ObjectStoreType::Memory) | None => { + info!(object_store_type = "Memory", "Object Store"); + Arc::new(InMemory::new()) + } + Some(ObjectStoreType::MemoryThrottled) => { + let config = ThrottleConfig { + // for every call: assume a 100ms latency + wait_delete_per_call: Duration::from_millis(100), + wait_get_per_call: Duration::from_millis(100), + wait_list_per_call: Duration::from_millis(100), + wait_list_with_delimiter_per_call: Duration::from_millis(100), + wait_put_per_call: Duration::from_millis(100), + + // for list operations: assume we need 1 call per 1k entries at 100ms + wait_list_per_entry: Duration::from_millis(100) / 1_000, + wait_list_with_delimiter_per_entry: Duration::from_millis(100) / 1_000, + + // for upload/download: assume 1GByte/s + wait_get_per_byte: Duration::from_secs(1) / 1_000_000_000, + }; + + info!(?config, object_store_type = "Memory", "Object Store"); + Arc::new(ThrottledStore::new(InMemory::new(), config)) + } + + Some(ObjectStoreType::Google) => new_gcs(config)?, + Some(ObjectStoreType::S3) => new_s3(config)?, + Some(ObjectStoreType::Azure) => new_azure(config)?, + Some(ObjectStoreType::File) => match config.database_directory.as_ref() { + Some(db_dir) => { + info!(?db_dir, object_store_type = "Directory", "Object Store"); + fs::create_dir_all(db_dir) + .context(CreatingDatabaseDirectorySnafu { path: db_dir })?; + + let store = object_store::local::LocalFileSystem::new_with_prefix(db_dir) + .context(CreateLocalFileSystemSnafu { path: db_dir })?; + Arc::new(store) + } + None => MissingObjectStoreConfigSnafu { + object_store: ObjectStoreType::File, + missing: "data-dir", + } + .fail()?, + }, + }; + + if let Some(cache_config) = &config.cache_config { + let cache = parquet_cache::make_client( + cache_config.namespace_addr.clone(), + Arc::clone(&remote_store), + ); + info!(?cache_config, "Parquet cache enabled"); + Ok(cache) + 
} else { + Ok(remote_store) + } +} + +/// The `object_store::signer::Signer` trait is only implemented for AWS currently, so when the AWS +/// feature is enabled and the configured object store is S3, return a signer. +#[cfg(feature = "aws")] +pub fn make_presigned_url_signer( + config: &ObjectStoreConfig, +) -> Result>, ParseError> { + match &config.object_store { + Some(ObjectStoreType::S3) => Ok(Some(Arc::new(build_s3(config)?))), + _ => Ok(None), + } +} + +/// The `object_store::signer::Signer` trait is only implemented for AWS currently, so if the AWS +/// feature isn't enabled, don't return a signer. +#[cfg(not(feature = "aws"))] +pub fn make_presigned_url_signer( + _config: &ObjectStoreConfig, +) -> Result>, ParseError> { + Ok(None) +} + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum CheckError { + #[snafu(display("Cannot read from object store: {}", source))] + CannotReadObjectStore { source: object_store::Error }, +} + +/// Check if object store is properly configured and accepts writes and reads. +/// +/// Note: This does NOT test if the object store is writable! +pub async fn check_object_store(object_store: &DynObjectStore) -> Result<(), CheckError> { + // Use some prefix that will very likely end in an empty result, so we don't pull too much actual data here. + let uuid = Uuid::new_v4().to_string(); + let prefix = Path::from_iter([uuid]); + + // create stream (this might fail if the store is not readable) + let mut stream = object_store.list(Some(&prefix)); + + // ... 
but sometimes it fails only if we use the resulting stream, so try that once + stream + .try_next() + .await + .context(CannotReadObjectStoreSnafu)?; + + // store seems to be readable + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + use std::env; + use tempfile::TempDir; + + #[test] + fn default_object_store_is_memory() { + let config = ObjectStoreConfig::try_parse_from(["server"]).unwrap(); + + let object_store = make_object_store(&config).unwrap(); + assert_eq!(&object_store.to_string(), "InMemory") + } + + #[test] + fn explicitly_set_object_store_to_memory() { + let config = + ObjectStoreConfig::try_parse_from(["server", "--object-store", "memory"]).unwrap(); + + let object_store = make_object_store(&config).unwrap(); + assert_eq!(&object_store.to_string(), "InMemory") + } + + #[test] + fn default_url_signer_is_none() { + let config = ObjectStoreConfig::try_parse_from(["server"]).unwrap(); + + let signer = make_presigned_url_signer(&config).unwrap(); + assert!(signer.is_none(), "Expected None, got {signer:?}"); + } + + #[test] + #[cfg(feature = "aws")] + fn valid_s3_config() { + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "s3", + "--bucket", + "mybucket", + "--aws-access-key-id", + "NotARealAWSAccessKey", + "--aws-secret-access-key", + "NotARealAWSSecretAccessKey", + ]) + .unwrap(); + + let object_store = make_object_store(&config).unwrap(); + assert_eq!( + &object_store.to_string(), + "LimitStore(16, AmazonS3(mybucket))" + ) + } + + #[test] + #[cfg(feature = "aws")] + fn s3_config_missing_params() { + let mut config = + ObjectStoreConfig::try_parse_from(["server", "--object-store", "s3"]).unwrap(); + + // clean out eventual leaks via env variables + config.bucket = None; + + let err = make_object_store(&config).unwrap_err().to_string(); + + assert_eq!( + err, + "Error configuring Amazon S3: Generic S3 error: Missing bucket name" + ); + } + + #[test] + #[cfg(feature = "aws")] + fn 
valid_s3_url_signer() { + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "s3", + "--bucket", + "mybucket", + "--aws-access-key-id", + "NotARealAWSAccessKey", + "--aws-secret-access-key", + "NotARealAWSSecretAccessKey", + ]) + .unwrap(); + + assert!(make_presigned_url_signer(&config).unwrap().is_some()); + + // Even with the aws feature on, any other object store shouldn't create a signer. + let root = TempDir::new().unwrap(); + let root_path = root.path().to_str().unwrap(); + + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "file", + "--data-dir", + root_path, + ]) + .unwrap(); + + let signer = make_presigned_url_signer(&config).unwrap(); + assert!(signer.is_none(), "Expected None, got {signer:?}"); + } + + #[test] + #[cfg(feature = "aws")] + fn s3_url_signer_config_missing_params() { + let mut config = + ObjectStoreConfig::try_parse_from(["server", "--object-store", "s3"]).unwrap(); + + // clean out eventual leaks via env variables + config.bucket = None; + + let err = make_presigned_url_signer(&config).unwrap_err().to_string(); + + assert_eq!( + err, + "Error configuring Amazon S3: Generic S3 error: Missing bucket name" + ); + } + + #[test] + #[cfg(feature = "gcp")] + fn valid_google_config() { + use std::io::Write; + use tempfile::NamedTempFile; + + let mut file = NamedTempFile::new().expect("tempfile should be created"); + const FAKE_KEY: &str = r#"{"private_key": "private_key", "private_key_id": "private_key_id", "client_email":"client_email", "disable_oauth":true}"#; + writeln!(file, "{FAKE_KEY}").unwrap(); + let path = file.path().to_str().expect("file path should exist"); + + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "google", + "--bucket", + "mybucket", + "--google-service-account", + path, + ]) + .unwrap(); + + let object_store = make_object_store(&config).unwrap(); + assert_eq!( + &object_store.to_string(), + "LimitStore(16, 
GoogleCloudStorage(mybucket))" + ) + } + + #[test] + #[cfg(feature = "gcp")] + fn google_config_missing_params() { + let mut config = + ObjectStoreConfig::try_parse_from(["server", "--object-store", "google"]).unwrap(); + + // clean out eventual leaks via env variables + config.bucket = None; + + let err = make_object_store(&config).unwrap_err().to_string(); + + assert_eq!( + err, + "Error configuring GCS: Generic GCS error: Missing bucket name" + ); + } + + #[test] + #[cfg(feature = "azure")] + fn valid_azure_config() { + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "azure", + "--bucket", + "mybucket", + "--azure-storage-account", + "NotARealStorageAccount", + "--azure-storage-access-key", + "Zm9vYmFy", // base64 encoded "foobar" + ]) + .unwrap(); + + let object_store = make_object_store(&config).unwrap(); + assert_eq!(&object_store.to_string(), "LimitStore(16, MicrosoftAzure { account: NotARealStorageAccount, container: mybucket })") + } + + #[test] + #[cfg(feature = "azure")] + fn azure_config_missing_params() { + let mut config = + ObjectStoreConfig::try_parse_from(["server", "--object-store", "azure"]).unwrap(); + + // clean out eventual leaks via env variables + config.bucket = None; + + let err = make_object_store(&config).unwrap_err().to_string(); + + assert_eq!( + err, + "Error configuring Microsoft Azure: Generic MicrosoftAzure error: Container name must be specified" + ); + } + + #[test] + fn valid_file_config() { + let root = TempDir::new().unwrap(); + let root_path = root.path().to_str().unwrap(); + + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "file", + "--data-dir", + root_path, + ]) + .unwrap(); + + let object_store = make_object_store(&config).unwrap().to_string(); + assert!( + object_store.starts_with("LocalFileSystem"), + "{}", + object_store + ) + } + + #[test] + fn file_config_missing_params() { + // this test tests for failure to configure the object store because of 
data-dir configuration missing + // if the INFLUXDB_IOX_DB_DIR env variable is set, the test fails because the configuration is + // actually present. + env::remove_var("INFLUXDB_IOX_DB_DIR"); + let config = + ObjectStoreConfig::try_parse_from(["server", "--object-store", "file"]).unwrap(); + + let err = make_object_store(&config).unwrap_err().to_string(); + + assert_eq!( + err, + "Specified File for the object store, required configuration missing for \ + data-dir" + ); + } + + #[test] + fn valid_cache_config() { + let root = TempDir::new().unwrap(); + let root_path = root.path().to_str().unwrap(); + + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "file", + "--data-dir", + root_path, + "--parquet-cache-namespace-addr", + "http://k8s-noninstance-general-service-route:8080", + ]) + .unwrap(); + + let object_store = make_object_store(&config).unwrap().to_string(); + assert!( + object_store.starts_with("DataCacheObjectStore"), + "{}", + object_store + ) + } +} diff --git a/clap_blocks/src/parquet_cache.rs b/clap_blocks/src/parquet_cache.rs new file mode 100644 index 0000000..d93aa94 --- /dev/null +++ b/clap_blocks/src/parquet_cache.rs @@ -0,0 +1,57 @@ +//! CLI handling for parquet data cache config (via CLI arguments and environment variables). + +/// Config for cache client. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct ParquetCacheClientConfig { + /// The address for the service namespace (not a given instance). + /// + /// When the client comes online, it discovers the keyspace + /// by issue requests to this address. + #[clap( + long = "parquet-cache-namespace-addr", + env = "INFLUXDB_IOX_PARQUET_CACHE_NAMESPACE_ADDR", + required = false + )] + pub namespace_addr: String, +} + +/// Config for cache instance. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct ParquetCacheInstanceConfig { + /// The path to the config file for the keyspace. 
+ #[clap( + long = "parquet-cache-keyspace-config-path", + env = "INFLUXDB_IOX_PARQUET_CACHE_KEYSPACE_CONFIG_PATH", + required = true + )] + pub keyspace_config_path: String, + + /// The hostname of the cache instance (k8s pod) running this process. + /// + /// Cache controller should be setting this env var. + #[clap( + long = "parquet-cache-instance-hostname", + env = "HOSTNAME", + required = true + )] + pub instance_hostname: String, + + /// The local directory to store data. + #[clap( + long = "parquet-cache-local-dir", + env = "INFLUXDB_IOX_PARQUET_CACHE_LOCAL_DIR", + required = true + )] + pub local_dir: String, +} + +impl From for parquet_cache::ParquetCacheServerConfig { + fn from(instance_config: ParquetCacheInstanceConfig) -> Self { + Self { + keyspace_config_path: instance_config.keyspace_config_path, + hostname: instance_config.instance_hostname, + local_dir: instance_config.local_dir, + policy_config: Default::default(), + } + } +} diff --git a/clap_blocks/src/querier.rs b/clap_blocks/src/querier.rs new file mode 100644 index 0000000..4a62455 --- /dev/null +++ b/clap_blocks/src/querier.rs @@ -0,0 +1,264 @@ +//! Querier-related configs. + +use crate::{ + ingester_address::IngesterAddress, + memory_size::MemorySize, + single_tenant::{CONFIG_AUTHZ_ENV_NAME, CONFIG_AUTHZ_FLAG}, +}; +use std::{collections::HashMap, num::NonZeroUsize}; + +/// CLI config for querier configuration +#[derive(Debug, Clone, PartialEq, Eq, clap::Parser)] +pub struct QuerierConfig { + /// Addr for connection to authz + #[clap(long = CONFIG_AUTHZ_FLAG, env = CONFIG_AUTHZ_ENV_NAME)] + pub authz_address: Option, + + /// The number of threads to use for queries. + /// + /// If not specified, defaults to the number of cores on the system + #[clap( + long = "num-query-threads", + env = "INFLUXDB_IOX_NUM_QUERY_THREADS", + action + )] + pub num_query_threads: Option, + + /// Size of memory pool used during query exec, in bytes. 
+ /// + /// If queries attempt to allocate more than this many bytes + /// during execution, they will error with "ResourcesExhausted". + /// + /// Can be given as absolute value or in percentage of the total available memory (e.g. `10%`). + #[clap( + long = "exec-mem-pool-bytes", + env = "INFLUXDB_IOX_EXEC_MEM_POOL_BYTES", + default_value = "8589934592", // 8GB + action + )] + pub exec_mem_pool_bytes: MemorySize, + + /// gRPC address for the router to talk with the ingesters. For + /// example: + /// + /// "http://127.0.0.1:8083" + /// + /// or + /// + /// "http://10.10.10.1:8083,http://10.10.10.2:8083" + /// + /// for multiple addresses. + #[clap( + long = "ingester-addresses", + env = "INFLUXDB_IOX_INGESTER_ADDRESSES", + required = false, + num_args = 0.., + value_delimiter = ',' + )] + pub ingester_addresses: Vec, + + /// Size of the RAM cache used to store catalog metadata information in bytes. + /// + /// Can be given as absolute value or in percentage of the total available memory (e.g. `10%`). + #[clap( + long = "ram-pool-metadata-bytes", + env = "INFLUXDB_IOX_RAM_POOL_METADATA_BYTES", + default_value = "134217728", // 128MB + action + )] + pub ram_pool_metadata_bytes: MemorySize, + + /// Size of the RAM cache used to store data in bytes. + /// + /// Can be given as absolute value or in percentage of the total available memory (e.g. `10%`). + #[clap( + long = "ram-pool-data-bytes", + env = "INFLUXDB_IOX_RAM_POOL_DATA_BYTES", + default_value = "1073741824", // 1GB + action + )] + pub ram_pool_data_bytes: MemorySize, + + /// Limit the number of concurrent queries. + #[clap( + long = "max-concurrent-queries", + env = "INFLUXDB_IOX_MAX_CONCURRENT_QUERIES", + default_value = "10", + action + )] + pub max_concurrent_queries: usize, + + /// After how many ingester query errors should the querier enter circuit breaker mode? + /// + /// The querier normally contacts the ingester for any unpersisted data during query planning. 
+ /// However, when the ingester can not be contacted for some reason, the querier will begin + /// returning results that do not include unpersisted data and enter "circuit breaker mode" + /// to avoid continually retrying the failing connection on subsequent queries. + /// + /// If circuits are open, the querier will NOT contact the ingester and no unpersisted data + /// will be presented to the user. + /// + /// Circuits will switch to "half open" after some jittered timeout and the querier will try to + /// use the ingester in question again. If this succeeds, we are back to normal, otherwise it + /// will back off exponentially before trying again (and again ...). + /// + /// In a production environment the `ingester_circuit_state` metric should be monitored. + #[clap( + long = "ingester-circuit-breaker-threshold", + env = "INFLUXDB_IOX_INGESTER_CIRCUIT_BREAKER_THRESHOLD", + default_value = "10", + action + )] + pub ingester_circuit_breaker_threshold: u64, + + /// DataFusion config. + #[clap( + long = "datafusion-config", + env = "INFLUXDB_IOX_DATAFUSION_CONFIG", + default_value = "", + value_parser = parse_datafusion_config, + action + )] + pub datafusion_config: HashMap, + + /// Use the new V2 API to talk to the ingester. + /// + /// Defaults to "no". + /// + /// See . 
/// Parse a comma-separated list of `KEY:VALUE` pairs into a map.
///
/// Whitespace around the whole string, around each pair, and around each
/// key and value is trimmed. Only the FIRST `:` in a pair separates key
/// from value, so values may themselves contain colons (e.g. `x:y:z`
/// parses as key `x`, value `y:z`). An empty (or all-whitespace) input
/// yields an empty map.
///
/// # Errors
///
/// Returns an error if a pair contains no `:` separator, or if the same
/// key appears more than once.
fn parse_datafusion_config(
    s: &str,
) -> Result<HashMap<String, String>, Box<dyn std::error::Error + Send + Sync + 'static>> {
    let s = s.trim();
    if s.is_empty() {
        return Ok(HashMap::with_capacity(0));
    }

    let mut out = HashMap::new();
    for part in s.split(',') {
        // split_once splits on the first ':' only, replacing the previous
        // splitn(2, ':') + Vec collect (one fewer allocation per pair).
        // NOTE: the "invalid pair" message intentionally echoes the whole
        // input string `s`, matching the established CLI error text.
        let (key, value) = part
            .trim()
            .split_once(':')
            .ok_or_else(|| format!("Invalid key value pair - expected 'KEY:VALUE' got '{s}'"))?;
        let key = key.trim().to_owned();
        if out.insert(key.clone(), value.trim().to_owned()).is_some() {
            return Err(format!("key '{key}' passed multiple times").into());
        }
    }

    Ok(out)
}
.to_string(); + + assert_contains!( + actual, + "error: \ + invalid value '\\ingester-0:8082' \ + for '--ingester-addresses [...]': \ + invalid uri character" + ); + } + + #[test] + fn test_datafusion_config() { + let actual = QuerierConfig::try_parse_from([ + "my_binary", + "--datafusion-config= foo : bar , x:y:z ", + ]) + .unwrap(); + + assert_eq!( + actual.datafusion_config, + HashMap::from([ + (String::from("foo"), String::from("bar")), + (String::from("x"), String::from("y:z")), + ]), + ); + } + + #[test] + fn bad_datafusion_config() { + let actual = QuerierConfig::try_parse_from(["my_binary", "--datafusion-config=foo"]) + .unwrap_err() + .to_string(); + assert_contains!( + actual, + "error: invalid value 'foo' for '--datafusion-config ': Invalid key value pair - expected 'KEY:VALUE' got 'foo'" + ); + + let actual = + QuerierConfig::try_parse_from(["my_binary", "--datafusion-config=foo:bar,baz:1,foo:2"]) + .unwrap_err() + .to_string(); + assert_contains!( + actual, + "error: invalid value 'foo:bar,baz:1,foo:2' for '--datafusion-config ': key 'foo' passed multiple times" + ); + } +} diff --git a/clap_blocks/src/router.rs b/clap_blocks/src/router.rs new file mode 100644 index 0000000..28442d7 --- /dev/null +++ b/clap_blocks/src/router.rs @@ -0,0 +1,165 @@ +//! CLI config for the router using the RPC write path + +use crate::{ + bulk_ingest::BulkIngestConfig, + gossip::GossipConfig, + ingester_address::IngesterAddress, + single_tenant::{ + CONFIG_AUTHZ_ENV_NAME, CONFIG_AUTHZ_FLAG, CONFIG_CST_ENV_NAME, CONFIG_CST_FLAG, + }, +}; +use std::{ + num::{NonZeroUsize, ParseIntError}, + time::Duration, +}; + +/// CLI config for the router using the RPC write path +#[derive(Debug, Clone, clap::Parser)] +#[allow(missing_copy_implementations)] +pub struct RouterConfig { + /// Gossip config. + #[clap(flatten)] + pub gossip_config: GossipConfig, + + /// Bulk ingest API config. 
+ #[clap(flatten)] + pub bulk_ingest_config: BulkIngestConfig, + + /// Addr for connection to authz + #[clap( + long = CONFIG_AUTHZ_FLAG, + env = CONFIG_AUTHZ_ENV_NAME, + requires("single_tenant_deployment"), + )] + pub authz_address: Option, + + /// Differential handling based upon deployment to CST vs MT. + /// + /// At minimum, differs in supports of v1 endpoint. But also includes + /// differences in namespace handling, etc. + #[clap( + long = CONFIG_CST_FLAG, + env = CONFIG_CST_ENV_NAME, + default_value = "false", + requires_if("true", "authz_address") + )] + pub single_tenant_deployment: bool, + + /// The maximum number of simultaneous requests the HTTP server is + /// configured to accept. + /// + /// This number of requests, multiplied by the maximum request body size the + /// HTTP server is configured with gives the rough amount of memory a HTTP + /// server will use to buffer request bodies in memory. + /// + /// A default maximum of 200 requests, multiplied by the default 10MiB + /// maximum for HTTP request bodies == ~2GiB. + #[clap( + long = "max-http-requests", + env = "INFLUXDB_IOX_MAX_HTTP_REQUESTS", + default_value = "200", + action + )] + pub http_request_limit: usize, + + /// When writing line protocol data, does an error on a single line + /// reject the write? Or will all individual valid lines be written? + /// Set to true to enable all valid lines to write. + #[clap( + long = "partial-writes-enabled", + env = "INFLUXDB_IOX_PARTIAL_WRITES_ENABLED", + default_value = "false", + action + )] + pub permit_partial_writes: bool, + + /// gRPC address for the router to talk with the ingesters. For + /// example: + /// + /// "http://127.0.0.1:8083" + /// + /// or + /// + /// "http://10.10.10.1:8083,http://10.10.10.2:8083" + /// + /// for multiple addresses. 
+ #[clap( + long = "ingester-addresses", + env = "INFLUXDB_IOX_INGESTER_ADDRESSES", + required = true, + num_args=1.., + value_delimiter = ',' + )] + pub ingester_addresses: Vec, + + /// Retention period to use when auto-creating namespaces. + /// For infinite retention, leave this unset and it will default to `None`. + /// Setting it to zero will not make it infinite. + /// Ignored if namespace-autocreation-enabled is set to false. + #[clap( + long = "new-namespace-retention-hours", + env = "INFLUXDB_IOX_NEW_NAMESPACE_RETENTION_HOURS", + action + )] + pub new_namespace_retention_hours: Option, + + /// When writing data to a non-existent namespace, should the router auto-create the namespace + /// or reject the write? Set to false to disable namespace autocreation. + #[clap( + long = "namespace-autocreation-enabled", + env = "INFLUXDB_IOX_NAMESPACE_AUTOCREATION_ENABLED", + default_value = "true", + action + )] + pub namespace_autocreation_enabled: bool, + + /// Specify the timeout in seconds for a single RPC write request to an + /// ingester. + #[clap( + long = "rpc-write-timeout-seconds", + env = "INFLUXDB_IOX_RPC_WRITE_TIMEOUT_SECONDS", + default_value = "3", + value_parser = parse_duration + )] + pub rpc_write_timeout_seconds: Duration, + + /// Specify the maximum allowed outgoing RPC write message size when + /// communicating with the Ingester. + #[clap( + long = "rpc-write-max-outgoing-bytes", + env = "INFLUXDB_IOX_RPC_WRITE_MAX_OUTGOING_BYTES", + default_value = "104857600", // 100MiB + )] + pub rpc_write_max_outgoing_bytes: usize, + + /// Enable optional replication for each RPC write. + /// + /// This value specifies the total number of copies of data after + /// replication, defaulting to 1. + /// + /// If the desired replication level is not achieved, a partial write error + /// will be returned to the user. The write MAY be queryable after a partial + /// write failure. 
/// Interpret `input` as a whole (unsigned) number of seconds and convert
/// it into a [`Duration`].
///
/// Returns the underlying [`ParseIntError`] when `input` is not a valid
/// non-negative integer.
fn parse_duration(input: &str) -> Result<Duration, ParseIntError> {
    let seconds: u64 = input.parse()?;
    Ok(Duration::from_secs(seconds))
}
+ #[clap( + long = "grpc-bind", + env = "INFLUXDB_IOX_GRPC_BIND_ADDR", + default_value = DEFAULT_GRPC_BIND_ADDR, + action, + )] + pub grpc_bind_address: SocketAddr, + + /// Maximum size of HTTP requests. + #[clap( + long = "max-http-request-size", + env = "INFLUXDB_IOX_MAX_HTTP_REQUEST_SIZE", + default_value = "10485760", // 10 MiB + action, + )] + pub max_http_request_size: usize, + + /// object store config + #[clap(flatten)] + pub(crate) object_store_config: ObjectStoreConfig, +} + +impl RunConfig { + /// Get a reference to the run config's tracing config. + pub fn tracing_config(&self) -> &TracingConfig { + &self.tracing_config + } + + /// Get a reference to the run config's object store config. + pub fn object_store_config(&self) -> &ObjectStoreConfig { + &self.object_store_config + } + + /// Get a mutable reference to the run config's tracing config. + pub fn tracing_config_mut(&mut self) -> &mut TracingConfig { + &mut self.tracing_config + } + + /// Get a reference to the run config's logging config. + pub fn logging_config(&self) -> &LoggingConfig { + &self.logging_config + } + + /// set the http bind address + pub fn with_http_bind_address(mut self, http_bind_address: SocketAddr) -> Self { + self.http_bind_address = http_bind_address; + self + } + + /// set the grpc bind address + pub fn with_grpc_bind_address(mut self, grpc_bind_address: SocketAddr) -> Self { + self.grpc_bind_address = grpc_bind_address; + self + } + + /// Create a new instance for all-in-one mode, only allowing some arguments. 
+ pub fn new( + logging_config: LoggingConfig, + tracing_config: TracingConfig, + http_bind_address: SocketAddr, + grpc_bind_address: SocketAddr, + max_http_request_size: usize, + object_store_config: ObjectStoreConfig, + ) -> Self { + Self { + logging_config, + tracing_config, + http_bind_address, + grpc_bind_address, + max_http_request_size, + object_store_config, + } + } +} diff --git a/clap_blocks/src/single_tenant.rs b/clap_blocks/src/single_tenant.rs new file mode 100644 index 0000000..fb7fb95 --- /dev/null +++ b/clap_blocks/src/single_tenant.rs @@ -0,0 +1,11 @@ +//! CLI config for request authorization. + +/// Env var providing authz address +pub const CONFIG_AUTHZ_ENV_NAME: &str = "INFLUXDB_IOX_AUTHZ_ADDR"; +/// CLI flag for authz address +pub const CONFIG_AUTHZ_FLAG: &str = "authz-addr"; + +/// Env var for single tenancy deployments +pub const CONFIG_CST_ENV_NAME: &str = "INFLUXDB_IOX_SINGLE_TENANCY"; +/// CLI flag for single tenancy deployments +pub const CONFIG_CST_FLAG: &str = "single-tenancy"; diff --git a/clap_blocks/src/socket_addr.rs b/clap_blocks/src/socket_addr.rs new file mode 100644 index 0000000..02a1014 --- /dev/null +++ b/clap_blocks/src/socket_addr.rs @@ -0,0 +1,77 @@ +//! Config for socket addresses. +use std::{net::ToSocketAddrs, ops::Deref}; + +/// Parsable socket address. 
/// A socket address that can be parsed from CLI arguments / env vars.
///
/// Thin newtype over [`std::net::SocketAddr`] whose [`FromStr`] impl
/// resolves host names (via [`ToSocketAddrs`]) and keeps the first
/// resolved address.
///
/// [`FromStr`]: std::str::FromStr
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SocketAddr(std::net::SocketAddr);

impl Deref for SocketAddr {
    type Target = std::net::SocketAddr;

    // Expose the wrapped address's methods (port(), ip(), ...) directly.
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl std::fmt::Display for SocketAddr {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Delegate to the inner address's Display formatting.
        write!(f, "{}", self.0)
    }
}

impl std::str::FromStr for SocketAddr {
    type Err = String;

    /// Resolve `s` and take the first address it yields.
    ///
    /// Resolution failures and empty resolution results are both reported
    /// as human-readable strings suitable for CLI error output.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mut addrs = s
            .to_socket_addrs()
            .map_err(|e| format!("Cannot parse socket address '{s}': {e}"))?;
        addrs
            .next()
            .map(Self)
            .ok_or_else(|| format!("Found no addresses for '{s}'"))
    }
}

impl From<SocketAddr> for std::net::SocketAddr {
    fn from(addr: SocketAddr) -> Self {
        let SocketAddr(inner) = addr;
        inner
    }
}
+ match addr { + std::net::SocketAddr::V4(so) => { + assert_eq!(so, SocketAddrV4::new(Ipv4Addr::new(127, 0, 0, 1), 1234)) + } + std::net::SocketAddr::V6(so) => assert_eq!( + so, + SocketAddrV6::new(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1), 1234, 0, 0) + ), + }; + + assert_eq!( + SocketAddr::from_str("!@INv_a1d(ad0/resp_!").unwrap_err(), + "Cannot parse socket address '!@INv_a1d(ad0/resp_!': invalid socket address", + ); + } +} diff --git a/client_util/Cargo.toml b/client_util/Cargo.toml new file mode 100644 index 0000000..8b2e12f --- /dev/null +++ b/client_util/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "client_util" +description = "Shared code for IOx clients" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +http = "0.2.11" +reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls-native-roots"] } +thiserror = "1.0.56" +tonic = { workspace = true } +tower = "0.4" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread"] } +mockito = { version = "1.2", default-features = false } diff --git a/client_util/src/connection.rs b/client_util/src/connection.rs new file mode 100644 index 0000000..d671502 --- /dev/null +++ b/client_util/src/connection.rs @@ -0,0 +1,295 @@ +use crate::tower::{SetRequestHeadersLayer, SetRequestHeadersService}; +use http::header::HeaderName; +use http::HeaderMap; +use http::{uri::InvalidUri, HeaderValue, Uri}; +use std::convert::TryInto; +use std::time::Duration; +use thiserror::Error; +use tonic::transport::{Channel, Endpoint}; +use tower::make::MakeConnection; + +/// The connection type used for clients. 
Use [`Builder`] to create +/// instances of [`Connection`] objects +#[derive(Debug, Clone)] +pub struct Connection { + grpc_connection: GrpcConnection, + http_connection: HttpConnection, +} + +impl Connection { + /// Create a new Connection + fn new(grpc_connection: GrpcConnection, http_connection: HttpConnection) -> Self { + Self { + grpc_connection, + http_connection, + } + } + + /// Consume `self` and return a [`GrpcConnection`] (suitable for use in + /// tonic clients) + pub fn into_grpc_connection(self) -> GrpcConnection { + self.grpc_connection + } + + /// Consume `self` and return a [`HttpConnection`] (suitable for making + /// calls to /api/v2 endpoints) + pub fn into_http_connection(self) -> HttpConnection { + self.http_connection + } +} + +/// The type used to make tonic (gRPC) requests +pub type GrpcConnection = SetRequestHeadersService; + +/// The type used to make raw http request +#[derive(Debug, Clone)] +pub struct HttpConnection { + /// The base uri of the IOx http API endpoint + uri: Uri, + /// http client connection + http_client: reqwest::Client, +} + +impl HttpConnection { + fn new(uri: Uri, http_client: reqwest::Client) -> Self { + Self { uri, http_client } + } + + /// Return a reference to the underyling http client + pub fn client(&self) -> &reqwest::Client { + &self.http_client + } + + /// Return a reference to the base uri of the IOx http API endpoint + pub fn uri(&self) -> &Uri { + &self.uri + } +} + +/// The default User-Agent header sent by the HTTP client. 
+pub const USER_AGENT: &str = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION")); +/// The default connection timeout +pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(1); +/// The default request timeout +pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); + +/// Errors returned by the ConnectionBuilder +#[derive(Debug, Error)] +pub enum Error { + /// Server returned an invalid argument error + #[error("Connection error: {}{}", source, details)] + TransportError { + /// underlying [`tonic::transport::Error`] + source: tonic::transport::Error, + /// stringified version of the tonic error's source + details: String, + }, + + /// Client received an unexpected error from the server + #[error("Invalid URI: {}", .0)] + InvalidUri(#[from] InvalidUri), +} + +// Custom impl to include underlying source (not included in tonic +// transport error) +impl From for Error { + fn from(source: tonic::transport::Error) -> Self { + use std::error::Error; + let details = source + .source() + .map(|e| format!(" ({e})")) + .unwrap_or_default(); + + Self::TransportError { source, details } + } +} + +/// Result type for the ConnectionBuilder +pub type Result = std::result::Result; + +/// A builder that produces a connection that can be used with any of the gRPC +/// clients +/// +/// ```no_run +/// #[tokio::main] +/// # async fn main() { +/// use client_util::connection::Builder; +/// use std::time::Duration; +/// +/// let connection = Builder::new() +/// .timeout(Duration::from_secs(42)) +/// .user_agent("my_awesome_client") +/// .build("http://127.0.0.1:8082/") +/// .await +/// .expect("connection must succeed"); +/// # } +/// ``` +#[derive(Debug, Clone)] +pub struct Builder { + user_agent: String, + headers: Vec<(HeaderName, HeaderValue)>, + connect_timeout: Duration, + timeout: Duration, +} + +impl std::default::Default for Builder { + fn default() -> Self { + Self { + user_agent: USER_AGENT.into(), + connect_timeout: DEFAULT_CONNECT_TIMEOUT, 
+ timeout: DEFAULT_TIMEOUT, + headers: Default::default(), + } + } +} + +impl Builder { + /// Create a new default builder + pub fn new() -> Self { + Default::default() + } + + /// Construct the [`Connection`] instance using the specified base URL. + pub async fn build(self, dst: D) -> Result + where + D: TryInto + Send, + { + let endpoint = self.create_endpoint(dst)?; + let channel = endpoint.connect().await?; + Ok(self.compose_middleware(channel, endpoint)) + } + + /// Construct the [`Connection`] instance using the specified base URL and custom connector. + pub async fn build_with_connector(self, dst: D, connector: C) -> Result + where + D: TryInto + Send, + C: MakeConnection + Send + 'static, + C::Connection: Unpin + Send + 'static, + C::Future: Send + 'static, + Box: From + Send + 'static, + { + let endpoint = self.create_endpoint(dst)?; + let channel = endpoint.connect_with_connector(connector).await?; + Ok(self.compose_middleware(channel, endpoint)) + } + + fn create_endpoint(&self, dst: D) -> Result + where + D: TryInto + Send, + { + let endpoint = Endpoint::from(dst.try_into()?) + .user_agent(&self.user_agent)? + .connect_timeout(self.connect_timeout) + .timeout(self.timeout); + Ok(endpoint) + } + + fn compose_middleware(self, channel: Channel, endpoint: Endpoint) -> Connection { + let headers_map: HeaderMap = self.headers.iter().cloned().collect(); + + // Compose channel with new tower middleware stack + let grpc_connection = tower::ServiceBuilder::new() + .layer(SetRequestHeadersLayer::new(self.headers)) + .service(channel); + + let http_client = reqwest::Client::builder() + .connection_verbose(true) + .default_headers(headers_map) + .build() + .expect("reqwest::Client should have built"); + + let http_connection = HttpConnection::new(endpoint.uri().clone(), http_client); + + Connection::new(grpc_connection, http_connection) + } + + /// Set the `User-Agent` header sent by this client. 
+ pub fn user_agent(self, user_agent: impl Into) -> Self { + Self { + user_agent: user_agent.into(), + ..self + } + } + + /// Sets a header to be included on all requests + pub fn header(self, header: impl Into, value: impl Into) -> Self { + let mut headers = self.headers; + headers.push((header.into(), value.into())); + Self { headers, ..self } + } + + /// Sets the maximum duration of time the client will wait for the IOx + /// server to accept the TCP connection before aborting the request. + /// + /// Note this does not bound the request duration - see + /// [`timeout`][Self::timeout]. + pub fn connect_timeout(self, timeout: Duration) -> Self { + Self { + connect_timeout: timeout, + ..self + } + } + + /// Bounds the total amount of time a single client HTTP request take before + /// being aborted. + /// + /// This timeout includes: + /// + /// - Establishing the TCP connection (see [`connect_timeout`]) + /// - Sending the HTTP request + /// - Waiting for, and receiving the entire HTTP response + /// + /// [`connect_timeout`]: Self::connect_timeout + pub fn timeout(self, timeout: Duration) -> Self { + Self { timeout, ..self } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use reqwest::Method; + + #[test] + fn test_builder_cloneable() { + // Clone is used by Conductor. 
+ fn assert_clone(_t: T) {} + assert_clone(Builder::default()) + } + + #[tokio::test(flavor = "multi_thread")] + async fn headers_are_set() { + let mut mock_server = mockito::Server::new_async().await; + let url = mock_server.url(); + + let http_connection = Builder::new() + .header( + HeaderName::from_static("foo"), + HeaderValue::from_static("bar"), + ) + .build(&url) + .await + .unwrap() + .into_http_connection(); + + let url = format!("{url}/the_api"); + println!("Sending to {url}"); + + let m = mock_server + .mock("POST", "/the_api") + .with_status(201) + .with_body("world") + .match_header("FOO", "bar") + .create_async() + .await; + + http_connection + .client() + .request(Method::POST, &url) + .send() + .await + .expect("Error making http request"); + + m.assert_async().await; + } +} diff --git a/client_util/src/lib.rs b/client_util/src/lib.rs new file mode 100644 index 0000000..74a1a34 --- /dev/null +++ b/client_util/src/lib.rs @@ -0,0 +1,32 @@ +//! Shared InfluxDB IOx API client functionality +#![deny( + rustdoc::broken_intra_doc_links, + rustdoc::bare_urls, + rust_2018_idioms, + missing_debug_implementations, + unreachable_pub +)] +#![warn( + missing_docs, + clippy::todo, + clippy::dbg_macro, + clippy::clone_on_ref_ptr, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] +#![allow(clippy::missing_docs_in_private_items)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +/// Builder for constructing connections for use with the various gRPC clients +pub mod connection; + +/// Helper to set client headers. +pub mod tower; + +/// Namespace <--> org/bucket utilities +pub mod namespace_translation; diff --git a/client_util/src/namespace_translation.rs b/client_util/src/namespace_translation.rs new file mode 100644 index 0000000..53f011e --- /dev/null +++ b/client_util/src/namespace_translation.rs @@ -0,0 +1,90 @@ +//! 
Contains logic to map namespace back/forth to org/bucket + +use thiserror::Error; + +/// Errors returned by namespace parsing +#[allow(missing_docs)] +#[derive(Debug, Error)] +pub enum Error { + #[error("Invalid namespace '{namespace}': {reason}")] + InvalidNamespace { namespace: String, reason: String }, +} + +impl Error { + fn new(namespace: impl Into, reason: impl Into) -> Self { + Self::InvalidNamespace { + namespace: namespace.into(), + reason: reason.into(), + } + } +} + +/// Splits up the namespace name into org_id and bucket_id +pub fn split_namespace(namespace: &str) -> Result<(&str, &str), Error> { + let mut iter = namespace.split('_'); + let org_id = iter.next().ok_or_else(|| Error::new(namespace, "empty"))?; + + if org_id.is_empty() { + return Err(Error::new(namespace, "No org_id found")); + } + + let bucket_id = iter + .next() + .ok_or_else(|| Error::new(namespace, "Could not find '_'"))?; + + if bucket_id.is_empty() { + return Err(Error::new(namespace, "No bucket_id found")); + } + + if iter.next().is_some() { + return Err(Error::new(namespace, "More than one '_'")); + } + + Ok((org_id, bucket_id)) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn split_good() { + assert_eq!(split_namespace("foo_bar").unwrap(), ("foo", "bar")); + } + + #[test] + #[should_panic(expected = "No org_id found")] + fn split_bad_empty() { + split_namespace("").unwrap(); + } + + #[test] + #[should_panic(expected = "No org_id found")] + fn split_bad_only_underscore() { + split_namespace("_").unwrap(); + } + + #[test] + #[should_panic(expected = "No org_id found")] + fn split_bad_empty_org_id() { + split_namespace("_ff").unwrap(); + } + + #[test] + #[should_panic(expected = "No bucket_id found")] + fn split_bad_empty_bucket_id() { + split_namespace("ff_").unwrap(); + } + + #[test] + #[should_panic(expected = "More than one '_'")] + fn split_too_many() { + split_namespace("ff_bf_").unwrap(); + } + + #[test] + #[should_panic(expected = "More than one '_'")] + fn 
split_way_too_many() { + split_namespace("ff_bf_dfd_3_f").unwrap(); + } +} diff --git a/client_util/src/tower.rs b/client_util/src/tower.rs new file mode 100644 index 0000000..73eae36 --- /dev/null +++ b/client_util/src/tower.rs @@ -0,0 +1,79 @@ +use http::header::HeaderName; +use http::{HeaderValue, Request, Response}; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tower::{Layer, Service}; + +/// `SetRequestHeadersLayer` sets the provided headers on all requests flowing through it +/// unless they're already set +#[derive(Debug, Clone)] +pub(crate) struct SetRequestHeadersLayer { + headers: Arc>, +} + +impl SetRequestHeadersLayer { + pub(crate) fn new(headers: Vec<(HeaderName, HeaderValue)>) -> Self { + Self { + headers: Arc::new(headers), + } + } +} + +impl Layer for SetRequestHeadersLayer { + type Service = SetRequestHeadersService; + + fn layer(&self, service: S) -> Self::Service { + SetRequestHeadersService { + service, + headers: Arc::clone(&self.headers), + } + } +} + +/// SetRequestHeadersService wraps an inner tower::Service and sets the provided +/// headers on requests flowing through it +#[derive(Debug, Clone)] +pub struct SetRequestHeadersService { + service: S, + headers: Arc>, +} + +impl SetRequestHeadersService { + /// Create sevice from inner service and headers. + pub fn new(service: S, headers: Vec<(HeaderName, HeaderValue)>) -> Self { + Self { + service, + headers: Arc::new(headers), + } + } + + /// De-construct service into parts. + /// + /// The can be used to call [`new`](Self::new) again. 
+ pub fn into_parts(self) -> (S, Arc>) { + let SetRequestHeadersService { service, headers } = self; + + (service, headers) + } +} + +impl Service> for SetRequestHeadersService +where + S: Service, Response = Response>, +{ + type Response = Response; + type Error = S::Error; + type Future = S::Future; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.service.poll_ready(cx) + } + + fn call(&mut self, mut request: Request) -> Self::Future { + let headers = request.headers_mut(); + for (name, value) in self.headers.iter() { + headers.insert(name, value.clone()); + } + self.service.call(request) + } +} diff --git a/data_types/Cargo.toml b/data_types/Cargo.toml new file mode 100644 index 0000000..c38745c --- /dev/null +++ b/data_types/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "data_types" +description = "Shared data types" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +arrow-buffer = { workspace = true } +bytes = "1.5" +chrono = { version = "0.4", default-features = false } +croaring = "1.0.0" +influxdb-line-protocol = { path = "../influxdb_line_protocol" } +iox_time = { path = "../iox_time" } +generated_types = { path = "../generated_types" } +murmur3 = "0.5.2" +observability_deps = { path = "../observability_deps" } +once_cell = "1" +ordered-float = "4" +percent-encoding = "2.3.1" +prost = { workspace = true } +schema = { path = "../schema" } +serde_json = "1.0" +siphasher = "1.0" +sha2 = { version = "0.10", default-features = false } +snafu = "0.8" +sqlx = { version = "0.7.3", features = ["runtime-tokio-rustls", "postgres", "uuid"] } +thiserror = "1.0.56" +uuid = { version = "1", features = ["v4"] } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] # In alphabetical order +assert_matches = "1" +paste = "1.0.14" +proptest = { version = "1.4.0", default-features = false } +test_helpers = { path = 
"../test_helpers" } +hex = "0.4.2" diff --git a/data_types/src/columns.rs b/data_types/src/columns.rs new file mode 100644 index 0000000..1c6b0a9 --- /dev/null +++ b/data_types/src/columns.rs @@ -0,0 +1,997 @@ +//! Types having to do with columns. + +use super::TableId; +use generated_types::influxdata::iox::{column_type::v1 as proto, gossip}; +use influxdb_line_protocol::FieldValue; +use schema::{builder::SchemaBuilder, sort::SortKey, InfluxColumnType, InfluxFieldType, Schema}; +use snafu::Snafu; +use std::cmp::Ordering; +use std::collections::HashSet; +use std::{ + collections::{BTreeMap, BTreeSet, HashMap}, + convert::TryFrom, + ops::Deref, + sync::Arc, +}; + +/// Unique ID for a `Column` +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub struct ColumnId(i64); + +#[allow(missing_docs)] +impl ColumnId { + pub fn new(v: i64) -> Self { + Self(v) + } + pub fn get(&self) -> i64 { + self.0 + } +} + +/// Column definitions for a table indexed by their name +#[derive(Debug, Clone, Eq, PartialEq, Hash, Default)] +pub struct ColumnsByName(BTreeMap, ColumnSchema>); + +impl From, ColumnSchema>> for ColumnsByName { + fn from(value: BTreeMap, ColumnSchema>) -> Self { + Self(value) + } +} + +impl ColumnsByName { + /// Create a new instance holding the given [`Column`]s. + pub fn new(columns: impl IntoIterator) -> Self { + Self( + columns + .into_iter() + .map(|c| { + ( + Arc::from(c.name), + ColumnSchema { + id: c.id, + column_type: c.column_type, + }, + ) + }) + .collect(), + ) + } + + /// Add the given column name and schema to this set of columns. + /// + /// # Panics + /// + /// This method panics if a column of the same name already exists in `self`. + pub fn add_column(&mut self, column_name: impl Into>, column_schema: ColumnSchema) { + let old = self.0.insert(column_name.into(), column_schema); + assert!(old.is_none()); + } + + /// Iterate over the names and columns. 
+ pub fn iter(&self) -> impl Iterator, &ColumnSchema)> { + self.0.iter() + } + + /// Whether a column with this name is in the set. + pub fn contains_column_name(&self, name: &str) -> bool { + self.0.contains_key(name) + } + + /// Return number of columns in the set. + pub fn column_count(&self) -> usize { + self.0.len() + } + + /// Return the set of column names. Used in combination with a write operation's + /// column names to determine whether a write would exceed the max allowed columns. + pub fn names(&self) -> BTreeSet<&str> { + self.0.keys().map(|name| name.as_ref()).collect() + } + + /// Return an iterator of the set of column IDs. + pub fn ids(&self) -> impl Iterator + '_ { + self.0.values().map(|c| c.id) + } + + /// Return column ids of the given column names + /// + /// # Panics + /// + /// Panics if any of the names are not found in this set. + pub fn ids_for_names(&self, names: impl IntoIterator + Send) -> SortKeyIds + where + T: AsRef, + { + SortKeyIds::from(names.into_iter().map(|name| { + let name = name.as_ref(); + self.get(name) + .unwrap_or_else(|| panic!("column name not found: {}", name)) + .id + .get() + })) + } + + /// Get a column by its name. + pub fn get(&self, name: &str) -> Option<&ColumnSchema> { + self.0.get(name) + } + + /// Get the `ColumnId` for the time column, if present (a table created through + /// `table_load_or_create` will always have a time column). + pub fn time_column_id(&self) -> Option { + self.get(schema::TIME_COLUMN_NAME).map(|column| column.id) + } + + /// Create `ID->name` map for columns. 
+ pub fn id_map(&self) -> HashMap> { + self.0 + .iter() + .map(|(name, c)| (c.id, Arc::clone(name))) + .collect() + } +} + +impl IntoIterator for ColumnsByName { + type Item = (Arc, ColumnSchema); + type IntoIter = std::collections::btree_map::IntoIter, ColumnSchema>; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} + +impl FromIterator<(Arc, ColumnSchema)> for ColumnsByName { + fn from_iter, ColumnSchema)>>(iter: T) -> Self { + Self(BTreeMap::from_iter(iter)) + } +} + +// ColumnsByName is a newtype so that we can implement this `TryFrom` in this crate +impl TryFrom for Schema { + type Error = schema::builder::Error; + + fn try_from(value: ColumnsByName) -> Result { + let mut builder = SchemaBuilder::new(); + + for (column_name, column_schema) in value.into_iter() { + let t = InfluxColumnType::from(column_schema.column_type); + builder.influx_column(column_name.as_ref(), t); + } + + builder.build() + } +} + +/// Data object for a column +#[derive(Debug, Clone, sqlx::FromRow, Eq, PartialEq)] +pub struct Column { + /// the column id + pub id: ColumnId, + /// the table id the column is in + pub table_id: TableId, + /// the name of the column, which is unique in the table + pub name: String, + /// the logical type of the column + pub column_type: ColumnType, +} + +impl Column { + /// returns true if the column type is a tag + pub fn is_tag(&self) -> bool { + self.column_type == ColumnType::Tag + } + + /// returns true if the column type matches the line protocol field value type + pub fn matches_field_type(&self, field_value: &FieldValue<'_>) -> bool { + match field_value { + FieldValue::I64(_) => self.column_type == ColumnType::I64, + FieldValue::U64(_) => self.column_type == ColumnType::U64, + FieldValue::F64(_) => self.column_type == ColumnType::F64, + FieldValue::String(_) => self.column_type == ColumnType::String, + FieldValue::Boolean(_) => self.column_type == ColumnType::Bool, + } + } +} + +/// The column id and its type for a column 
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub struct ColumnSchema { + /// the column id + pub id: ColumnId, + /// the column type + pub column_type: ColumnType, +} + +impl ColumnSchema { + /// returns true if the column is a tag + pub fn is_tag(&self) -> bool { + self.column_type == ColumnType::Tag + } + + /// returns true if the column matches the line protocol field value type + pub fn matches_field_type(&self, field_value: &FieldValue<'_>) -> bool { + matches!( + (field_value, self.column_type), + (FieldValue::I64(_), ColumnType::I64) + | (FieldValue::U64(_), ColumnType::U64) + | (FieldValue::F64(_), ColumnType::F64) + | (FieldValue::String(_), ColumnType::String) + | (FieldValue::Boolean(_), ColumnType::Bool) + ) + } + + /// Returns true if `mb_column` is of the same type as `self`. + pub fn matches_type(&self, mb_column_influx_type: InfluxColumnType) -> bool { + self.column_type == mb_column_influx_type + } +} + +impl TryFrom<&gossip::v1::Column> for ColumnSchema { + type Error = Box; + + fn try_from(v: &gossip::v1::Column) -> Result { + Ok(Self { + id: ColumnId::new(v.column_id), + column_type: ColumnType::try_from(v.column_type as i16)?, + }) + } +} + +/// The column data type +#[allow(missing_docs)] +#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, sqlx::Type)] +#[repr(i16)] +pub enum ColumnType { + I64 = 1, + U64 = 2, + F64 = 3, + Bool = 4, + String = 5, + Time = 6, + Tag = 7, +} + +impl ColumnType { + /// the short string description of the type + pub fn as_str(&self) -> &'static str { + match self { + Self::I64 => "i64", + Self::U64 => "u64", + Self::F64 => "f64", + Self::Bool => "bool", + Self::String => "string", + Self::Time => "time", + Self::Tag => "tag", + } + } +} + +impl std::fmt::Display for ColumnType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = self.as_str(); + + write!(f, "{s}") + } +} + +/// Errors deserialising a protobuf serialised [`ColumnType`]. 
+#[derive(Debug, Snafu)] +#[snafu(display("invalid column value"))] +#[allow(missing_copy_implementations)] +pub struct ColumnTypeProtoError {} + +impl TryFrom for ColumnType { + type Error = ColumnTypeProtoError; + + fn try_from(value: i16) -> Result { + match value { + x if x == Self::I64 as i16 => Ok(Self::I64), + x if x == Self::U64 as i16 => Ok(Self::U64), + x if x == Self::F64 as i16 => Ok(Self::F64), + x if x == Self::Bool as i16 => Ok(Self::Bool), + x if x == Self::String as i16 => Ok(Self::String), + x if x == Self::Time as i16 => Ok(Self::Time), + x if x == Self::Tag as i16 => Ok(Self::Tag), + _ => Err(ColumnTypeProtoError {}), + } + } +} + +impl From for ColumnType { + fn from(value: InfluxColumnType) -> Self { + match value { + InfluxColumnType::Tag => Self::Tag, + InfluxColumnType::Field(InfluxFieldType::Float) => Self::F64, + InfluxColumnType::Field(InfluxFieldType::Integer) => Self::I64, + InfluxColumnType::Field(InfluxFieldType::UInteger) => Self::U64, + InfluxColumnType::Field(InfluxFieldType::String) => Self::String, + InfluxColumnType::Field(InfluxFieldType::Boolean) => Self::Bool, + InfluxColumnType::Timestamp => Self::Time, + } + } +} + +impl From for InfluxColumnType { + fn from(value: ColumnType) -> Self { + match value { + ColumnType::I64 => Self::Field(InfluxFieldType::Integer), + ColumnType::U64 => Self::Field(InfluxFieldType::UInteger), + ColumnType::F64 => Self::Field(InfluxFieldType::Float), + ColumnType::Bool => Self::Field(InfluxFieldType::Boolean), + ColumnType::String => Self::Field(InfluxFieldType::String), + ColumnType::Time => Self::Timestamp, + ColumnType::Tag => Self::Tag, + } + } +} + +impl PartialEq for ColumnType { + fn eq(&self, got: &InfluxColumnType) -> bool { + match self { + Self::I64 => matches!(got, InfluxColumnType::Field(InfluxFieldType::Integer)), + Self::U64 => matches!(got, InfluxColumnType::Field(InfluxFieldType::UInteger)), + Self::F64 => matches!(got, InfluxColumnType::Field(InfluxFieldType::Float)), + 
Self::Bool => matches!(got, InfluxColumnType::Field(InfluxFieldType::Boolean)), + Self::String => matches!(got, InfluxColumnType::Field(InfluxFieldType::String)), + Self::Time => matches!(got, InfluxColumnType::Timestamp), + Self::Tag => matches!(got, InfluxColumnType::Tag), + } + } +} + +/// Returns the `ColumnType` for the passed in line protocol `FieldValue` type +pub fn column_type_from_field(field_value: &FieldValue<'_>) -> ColumnType { + match field_value { + FieldValue::I64(_) => ColumnType::I64, + FieldValue::U64(_) => ColumnType::U64, + FieldValue::F64(_) => ColumnType::F64, + FieldValue::String(_) => ColumnType::String, + FieldValue::Boolean(_) => ColumnType::Bool, + } +} + +impl TryFrom for ColumnType { + type Error = &'static str; + + fn try_from(value: proto::ColumnType) -> Result { + Ok(match value { + proto::ColumnType::I64 => Self::I64, + proto::ColumnType::U64 => Self::U64, + proto::ColumnType::F64 => Self::F64, + proto::ColumnType::Bool => Self::Bool, + proto::ColumnType::String => Self::String, + proto::ColumnType::Time => Self::Time, + proto::ColumnType::Tag => Self::Tag, + proto::ColumnType::Unspecified => return Err("unknown column type"), + }) + } +} + +impl From for proto::ColumnType { + fn from(value: ColumnType) -> Self { + match value { + ColumnType::I64 => Self::I64, + ColumnType::U64 => Self::U64, + ColumnType::F64 => Self::F64, + ColumnType::Bool => Self::Bool, + ColumnType::String => Self::String, + ColumnType::Time => Self::Time, + ColumnType::Tag => Self::Tag, + } + } +} + +/// Set of columns and used as Set data type. +/// +/// # Data Structure +/// This is internally implemented as a sorted vector. The sorting allows for fast [`PartialEq`]/[`Eq`]/[`Hash`] and +/// ensures that the PostgreSQL data is deterministic. Note that PostgreSQL does NOT have a set type at the moment, so +/// this is stored as an array. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, sqlx::Type)] +#[sqlx(transparent, no_pg_array)] +pub struct ColumnSet(Vec); + +impl ColumnSet { + /// Create new column set. + /// + /// The order of the passed columns will NOT be preserved. + /// + /// # Panic + /// Panics when the set of passed columns contains duplicates. + pub fn new(columns: I) -> Self + where + I: IntoIterator, + { + let mut columns: Vec = columns.into_iter().collect(); + columns.sort(); + + assert!( + columns.windows(2).all(|w| w[0] != w[1]), + "set contains duplicates" + ); + + columns.shrink_to_fit(); + + Self(columns) + } + + /// Create a new empty [`ColumnSet`] + pub fn empty() -> Self { + Self(Vec::new()) + } + + /// Estimate the memory consumption of this object and its contents + pub fn size(&self) -> usize { + std::mem::size_of_val(self) + (std::mem::size_of::() * self.0.capacity()) + } + + /// The set is empty or not + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Computes the union of `self` and `other` + pub fn union(&mut self, other: &Self) { + let mut insert_idx = 0; + let mut src_idx = 0; + + while insert_idx < self.0.len() && src_idx < other.0.len() { + let s = self.0[insert_idx]; + let o = other.0[src_idx]; + + match s.cmp(&o) { + Ordering::Less => insert_idx += 1, + Ordering::Equal => { + insert_idx += 1; + src_idx += 1; + } + Ordering::Greater => { + self.0.insert(insert_idx, o); + insert_idx += 1; + src_idx += 1; + } + } + } + self.0.extend_from_slice(&other.0[src_idx..]); + } + + /// Returns the indices and ids in `self` that are present in both `self` and `other` + /// + /// ``` + /// # use data_types::{ColumnId, ColumnSet}; + /// let a = ColumnSet::new([1, 2, 4, 6, 7].into_iter().map(ColumnId::new)); + /// let b = ColumnSet::new([2, 4, 6].into_iter().map(ColumnId::new)); + /// + /// assert_eq!( + /// a.intersect(&b).collect::>(), + /// vec![(1, b[0]), (2, b[1]), (3, b[2])] + /// ) + /// ``` + pub fn intersect<'a>( + &'a self, + other: &'a Self, + ) -> impl 
Iterator + 'a { + let mut left_idx = 0; + let mut right_idx = 0; + std::iter::from_fn(move || loop { + let s = self.0.get(left_idx)?; + let o = other.get(right_idx)?; + + match s.cmp(o) { + Ordering::Less => left_idx += 1, + Ordering::Greater => right_idx += 1, + Ordering::Equal => { + let t = left_idx; + left_idx += 1; + right_idx += 1; + return Some((t, *s)); + } + } + }) + } +} + +impl From for Vec { + fn from(set: ColumnSet) -> Self { + set.0 + } +} + +impl Deref for ColumnSet { + type Target = [ColumnId]; + + fn deref(&self) -> &Self::Target { + self.0.deref() + } +} + +/// Set of sorted column IDs in a specific given order at creation time, to be used as a +/// [`SortKey`] by looking up the column names in the table's schema. +#[derive(Debug, Clone, PartialEq, Eq, Hash, sqlx::Type, Default)] +#[sqlx(transparent, no_pg_array)] +pub struct SortKeyIds(Vec); + +impl SortKeyIds { + /// Create new sorted column set. + /// + /// The order of the passed columns will be preserved. + /// + /// # Panic + /// Panics when the set of passed columns contains duplicates. + pub fn new(columns: I) -> Self + where + I: IntoIterator, + { + let mut columns: Vec = columns.into_iter().collect(); + + // Validate the ID set contains no duplicates. + // + // This validates an invariant in debug builds, skipping the cost + // for release builds. + if cfg!(debug_assertions) { + SortKeyIds::check_for_deplicates(&columns); + } + + // Must continue with columns in original order + columns.shrink_to_fit(); + + Self(columns) + } + + /// Given another set of sort key IDs, merge them together and, if needed, return a value to + /// use to update the catalog. + /// + /// If `other` contains any column IDs that are not present in `self`, create a new + /// `SortKeyIds` instance that includes the new columns in `other` (in the same order they + /// appear in `other`) appended to the existing columns, but keeping the time column ID last. 
+ /// + /// If existing columns appear in `self` in a different order than they appear in `other`, the + /// order in `self` takes precedence and remains unchanged. + /// + /// If `self` contains all the sort keys in `other` already (regardless of order), this will + /// return `None` as no update to the catalog is needed. + pub fn maybe_append(&self, other: &Self, time_column_id: ColumnId) -> Option { + let existing_columns_without_time = self + .iter() + .cloned() + .filter(|&column_id| column_id != time_column_id); + + let mut new_columns = other + .iter() + .cloned() + .filter(|column_id| !self.contains(column_id)) + .peekable(); + + if new_columns.peek().is_none() { + None + } else { + Some(SortKeyIds::new( + existing_columns_without_time + .chain(new_columns) + .chain(std::iter::once(time_column_id)), + )) + } + } + + /// Estimate the memory consumption of this object and its contents + pub fn size(&self) -> usize { + std::mem::size_of_val(self) + (std::mem::size_of::() * self.0.capacity()) + } + + /// Build a [`SortKey`] from [`SortKeyIds`]; looking up column names in the provided + /// [`ColumnsByName`] map by converting it to a `HashMap. If you already have + /// an id-to-name column map, use [`SortKeyIds::to_sort_key_using_map`] instead. + /// + /// If you have a [`Partition`][super::Partition], it may be more convenient to call the + /// [`Partition::sort_key`][super::Partition::sort_key] method instead! + /// + /// # Panics + /// + /// Will panic if an ID isn't found in the column map. + pub fn to_sort_key(&self, columns: &ColumnsByName) -> SortKey { + let column_id_map = columns.id_map(); + self.to_sort_key_using_map(&column_id_map) + } + + /// Build a [`SortKey`] from [`SortKeyIds`]; looking up column names in the provided + /// [`HashMap`] map. + /// + /// If you have a [`Partition`][super::Partition], it may be more convenient to call the + /// [`Partition::sort_key`][super::Partition::sort_key] method instead! 
+ /// + /// # Panics + /// + /// Will panic if an ID isn't found in the column map. + pub fn to_sort_key_using_map(&self, column_id_map: &HashMap>) -> SortKey { + SortKey::from_columns(self.0.iter().map(|id| { + Arc::clone( + column_id_map.get(id).unwrap_or_else(|| { + panic!("cannot find column names for sort key id {}", id.get()) + }), + ) + })) + } + + /// Returns `true` if `other` is a monotonic update of `self`. + /// + /// # Panics + /// + /// Assumes "time" is the last column in both sets, and panics if the last + /// columns are not identical. + pub fn is_monotonic_update(&self, other: &Self) -> bool { + // The SortKeyIds always reference the time column last (if set). + if self.0.last().is_some() { + assert_eq!( + self.0.last(), + other.last(), + "last column in sort IDs must be time, and cannot change" + ); + } + + // Ensure the values in other are a prefix match, with the exception of + // the last "time" column. + self.0.len() <= other.len() + && self + .0 + .iter() + .take(self.0.len().saturating_sub(1)) + .zip(other.iter()) + .all(|(a, b)| a == b) + } + + fn check_for_deplicates(columns: &[ColumnId]) { + let mut column_ids: HashSet = HashSet::with_capacity(columns.len()); + for c in columns { + match column_ids.get(&c.0) { + Some(_) => { + panic!("set contains duplicates"); + } + _ => { + column_ids.insert(c.0); + } + } + } + } +} + +impl From for Vec { + fn from(set: SortKeyIds) -> Self { + set.0 + } +} + +impl Deref for SortKeyIds { + type Target = [ColumnId]; + + fn deref(&self) -> &Self::Target { + self.0.deref() + } +} + +impl From for SortKeyIds +where + I: IntoIterator, +{ + fn from(ids: I) -> Self { + Self::new(ids.into_iter().map(ColumnId::new).collect::>()) + } +} + +impl From<&SortKeyIds> for Vec { + fn from(val: &SortKeyIds) -> Self { + val.0.iter().map(|id| id.get()).collect() + } +} + +impl From<&SortKeyIds> for generated_types::influxdata::iox::catalog::v1::SortKeyIds { + fn from(val: &SortKeyIds) -> Self { + 
generated_types::influxdata::iox::catalog::v1::SortKeyIds { + array_sort_key_ids: val.into(), + } + } +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + + use super::*; + + #[test] + #[should_panic = "set contains duplicates"] + fn test_column_set_duplicates() { + ColumnSet::new([ColumnId::new(1), ColumnId::new(2), ColumnId::new(1)]); + } + + #[test] + fn test_column_set_eq() { + let set_1 = ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]); + let set_2 = ColumnSet::new([ColumnId::new(2), ColumnId::new(1)]); + assert_eq!(set_1, set_2); + } + + #[test] + fn test_column_set_union_intersect() { + let a = ColumnSet::new([1, 2, 5, 7].into_iter().map(ColumnId::new)); + let b = ColumnSet::new([1, 5, 6, 7, 8].into_iter().map(ColumnId::new)); + + let mut t = ColumnSet::empty(); + t.union(&a); + assert_eq!(t, a); + + assert_eq!( + t.intersect(&a).collect::>(), + vec![(0, a[0]), (1, a[1]), (2, a[2]), (3, a[3])] + ); + + t.union(&b); + let expected = ColumnSet::new([1, 2, 5, 6, 7, 8].into_iter().map(ColumnId::new)); + assert_eq!(t, expected); + + assert_eq!( + t.intersect(&a).collect::>(), + vec![(0, a[0]), (1, a[1]), (2, a[2]), (4, a[3])] + ); + + assert_eq!( + t.intersect(&b).collect::>(), + vec![(0, b[0]), (2, b[1]), (3, b[2]), (4, b[3]), (5, b[4])] + ); + } + + #[test] + #[should_panic = "set contains duplicates"] + fn test_sorted_column_set_duplicates() { + SortKeyIds::new([ + ColumnId::new(2), + ColumnId::new(1), + ColumnId::new(3), + ColumnId::new(1), + ]); + } + + #[test] + fn test_sorted_column_set() { + let set = SortKeyIds::new([ColumnId::new(2), ColumnId::new(1), ColumnId::new(3)]); + // verify the order is preserved + assert_eq!(set[0], ColumnId::new(2)); + assert_eq!(set[1], ColumnId::new(1)); + assert_eq!(set[2], ColumnId::new(3)); + } + + #[test] + fn test_column_schema() { + assert_eq!( + ColumnType::try_from(proto::ColumnType::I64).unwrap(), + ColumnType::I64, + ); + assert_eq!( + ColumnType::try_from(proto::ColumnType::U64).unwrap(), 
+ ColumnType::U64, + ); + assert_eq!( + ColumnType::try_from(proto::ColumnType::F64).unwrap(), + ColumnType::F64, + ); + assert_eq!( + ColumnType::try_from(proto::ColumnType::Bool).unwrap(), + ColumnType::Bool, + ); + assert_eq!( + ColumnType::try_from(proto::ColumnType::String).unwrap(), + ColumnType::String, + ); + assert_eq!( + ColumnType::try_from(proto::ColumnType::Time).unwrap(), + ColumnType::Time, + ); + assert_eq!( + ColumnType::try_from(proto::ColumnType::Tag).unwrap(), + ColumnType::Tag, + ); + + assert!(ColumnType::try_from(proto::ColumnType::Unspecified).is_err()); + } + + #[test] + fn test_gossip_proto_conversion() { + let proto = gossip::v1::Column { + name: "bananas".to_string(), + column_id: 42, + column_type: gossip::v1::column::ColumnType::String as _, + }; + + let got = ColumnSchema::try_from(&proto).expect("should succeed"); + assert_matches!(got, ColumnSchema{id, column_type} => { + assert_eq!(id.get(), 42); + assert_eq!(column_type, ColumnType::String); + }); + } + + #[test] + fn test_gossip_proto_conversion_invalid_type() { + let proto = gossip::v1::Column { + name: "bananas".to_string(), + column_id: 42, + column_type: 42, + }; + + ColumnSchema::try_from(&proto).expect_err("should succeed"); + } + + #[test] + fn test_columns_by_names_exist() { + let columns = build_columns_by_names(); + + let ids = columns.ids_for_names(["foo", "bar"]); + assert_eq!(ids, SortKeyIds::from([1, 2])); + } + + #[test] + fn test_columns_by_names_exist_different_order() { + let columns = build_columns_by_names(); + + let ids = columns.ids_for_names(["bar", "foo"]); + assert_eq!(ids, SortKeyIds::from([2, 1])); + } + + #[test] + #[should_panic = "column name not found: baz"] + fn test_columns_by_names_not_exist() { + let columns = build_columns_by_names(); + columns.ids_for_names(["foo", "baz"]); + } + + fn build_columns_by_names() -> ColumnsByName { + let mut columns: BTreeMap, ColumnSchema> = BTreeMap::new(); + columns.insert( + "foo".into(), + ColumnSchema { + 
id: ColumnId::new(1), + column_type: ColumnType::I64, + }, + ); + columns.insert( + "bar".into(), + ColumnSchema { + id: ColumnId::new(2), + column_type: ColumnType::I64, + }, + ); + columns.insert( + "time".into(), + ColumnSchema { + id: ColumnId::new(3), + column_type: ColumnType::Time, + }, + ); + columns.insert( + "tag1".into(), + ColumnSchema { + id: ColumnId::new(4), + column_type: ColumnType::Tag, + }, + ); + + ColumnsByName(columns) + } + + // panic if the sort_key_ids are not found in the columns + #[test] + #[should_panic(expected = "cannot find column names for sort key id 3")] + fn test_panic_build_sort_key_from_ids_and_map() { + // table columns + let uno = ColumnSchema { + id: ColumnId::new(1), + column_type: ColumnType::Tag, + }; + let dos = ColumnSchema { + id: ColumnId::new(2), + column_type: ColumnType::Tag, + }; + let mut column_map = ColumnsByName::default(); + column_map.add_column("uno", uno); + column_map.add_column("dos", dos); + + // sort_key_ids include some columns that are not in the columns + let sort_key_ids = SortKeyIds::from([2, 3]); + sort_key_ids.to_sort_key(&column_map); + } + + #[test] + fn test_build_sort_key_from_ids_and_map() { + // table columns + let uno = ColumnSchema { + id: ColumnId::new(1), + column_type: ColumnType::Tag, + }; + let dos = ColumnSchema { + id: ColumnId::new(2), + column_type: ColumnType::Tag, + }; + let tres = ColumnSchema { + id: ColumnId::new(3), + column_type: ColumnType::Tag, + }; + let mut column_map = ColumnsByName::default(); + column_map.add_column("uno", uno); + column_map.add_column("dos", dos); + column_map.add_column("tres", tres); + + // sort_key_ids is empty + let sort_key_ids = SortKeyIds::default(); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::empty()); + + // sort_key_ids include all columns and in the same order + let sort_key_ids = SortKeyIds::from([1, 2, 3]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, 
SortKey::from_columns(vec!["uno", "dos", "tres"])); + + // sort_key_ids include all columns but in different order + let sort_key_ids = SortKeyIds::from([2, 3, 1]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["dos", "tres", "uno"])); + + // sort_key_ids include some columns + let sort_key_ids = SortKeyIds::from([2, 3]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["dos", "tres"])); + + // sort_key_ids include some columns in different order + let sort_key_ids = SortKeyIds::from([3, 1]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["tres", "uno"])); + } + + #[test] + fn test_sort_key_ids_round_trip_encoding() { + let original = SortKeyIds::from([1, 2, 3]); + + let encoded: generated_types::influxdata::iox::catalog::v1::SortKeyIds = (&original).into(); + + let decoded: SortKeyIds = encoded.array_sort_key_ids.into(); + assert_eq!(decoded, original); + } + + macro_rules! test_is_monotonic_update { + ( + $name:ident, + a = $a:expr, + b = $b:expr, + want = $want:expr + ) => { + paste::paste! 
{ + #[test] + fn []() { + let a = SortKeyIds::new($a.into_iter().map(ColumnId::new)); + let b = SortKeyIds::new($b.into_iter().map(ColumnId::new)); + assert_eq!(a.is_monotonic_update(&b), $want) + } + } + }; + } + + test_is_monotonic_update!(equal, a = [42, 24, 1], b = [42, 24, 1], want = true); + + test_is_monotonic_update!(empty, a = [], b = [42, 24, 1], want = true); + + test_is_monotonic_update!( + longer_with_time, + a = [42, 24, 1], + b = [42, 24, 13, 1], + want = true + ); + + test_is_monotonic_update!(shorter_with_time, a = [42, 24, 1], b = [1], want = false); + + test_is_monotonic_update!( + mismatch_with_time, + a = [42, 24, 1], + b = [24, 42, 1], + want = false + ); + + test_is_monotonic_update!(mismatch, a = [42, 24, 1], b = [24, 42, 1], want = false); +} diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs new file mode 100644 index 0000000..951af51 --- /dev/null +++ b/data_types/src/lib.rs @@ -0,0 +1,2799 @@ +//! Shared data types + +// `clippy::use_self` is deliberately excluded from the lints this crate uses. +// See . +#![warn( + missing_copy_implementations, + missing_debug_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +use thiserror::Error; +// Workaround for "unused crate" lint false positives. 
+use workspace_hack as _; + +mod columns; +pub use columns::*; +mod namespace_name; +pub use namespace_name::*; +pub mod partition_template; +use partition_template::*; +pub mod partition; +pub use partition::*; +pub mod sequence_number_set; +pub mod service_limits; +pub mod snapshot; + +pub use service_limits::*; + +use observability_deps::tracing::warn; +use schema::TIME_COLUMN_NAME; +use snafu::Snafu; +use std::{ + borrow::Borrow, + collections::{BTreeMap, BTreeSet, HashMap}, + convert::TryFrom, + fmt::{Display, Write}, + mem::{self, size_of_val}, + num::{FpCategory, NonZeroU64}, + ops::{Add, Deref, Sub}, + sync::Arc, +}; +use uuid::Uuid; + +/// Errors deserialising a protobuf serialised [`ParquetFile`]. +#[derive(Debug, Snafu)] +#[snafu(display("invalid compaction level value"))] +#[allow(missing_copy_implementations)] +pub struct CompactionLevelProtoError {} + +/// Compaction levels +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, sqlx::Type)] +#[repr(i16)] +pub enum CompactionLevel { + /// The starting compaction level for parquet files persisted by an Ingester is zero. + Initial = 0, + /// Level of files persisted by a Compactor that do not overlap with non-level-0 files. 
+ FileNonOverlapped = 1, + /// Level of files persisted by a Compactor that are fully compacted and should not be + /// recompacted unless a new overlapping Initial level file arrives + Final = 2, +} + +impl Display for CompactionLevel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Initial => write!(f, "CompactionLevel::L0"), + Self::FileNonOverlapped => write!(f, "CompactionLevel::L1"), + Self::Final => write!(f, "CompactionLevel::L2"), + } + } +} + +impl TryFrom for CompactionLevel { + type Error = CompactionLevelProtoError; + + fn try_from(value: i32) -> Result { + match value { + x if x == Self::Initial as i32 => Ok(Self::Initial), + x if x == Self::FileNonOverlapped as i32 => Ok(Self::FileNonOverlapped), + x if x == Self::Final as i32 => Ok(Self::Final), + _ => Err(CompactionLevelProtoError {}), + } + } +} + +impl CompactionLevel { + /// When compacting files of this level, provide the level that the resulting file should be. + /// Does not exceed the maximum available level. 
+ pub fn next(&self) -> Self { + match self { + Self::Initial => Self::FileNonOverlapped, + Self::FileNonOverlapped => Self::Final, + Self::Final => Self::Final, + } + } + + /// Return previous level + pub fn prev(&self) -> Self { + match self { + Self::Initial => Self::Initial, + Self::FileNonOverlapped => Self::Initial, + Self::Final => Self::FileNonOverlapped, + } + } + + /// Returns all levels + pub fn all() -> &'static [Self] { + &[Self::Initial, Self::FileNonOverlapped, Self::Final] + } + + /// Static name + pub fn name(&self) -> &'static str { + match self { + Self::Initial => "L0", + Self::FileNonOverlapped => "L1", + Self::Final => "L2", + } + } +} + +/// Unique ID for a `Namespace` +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub struct NamespaceId(i64); + +#[allow(missing_docs)] +impl NamespaceId { + pub const fn new(v: i64) -> Self { + Self(v) + } + pub fn get(&self) -> i64 { + self.0 + } +} + +impl std::fmt::Display for NamespaceId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +/// Unique ID for a `Table` +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub struct TableId(i64); + +#[allow(missing_docs)] +impl TableId { + pub const fn new(v: i64) -> Self { + Self(v) + } + + pub fn get(&self) -> i64 { + self.0 + } + + pub const fn to_be_bytes(&self) -> [u8; 8] { + self.0.to_be_bytes() + } +} + +impl std::fmt::Display for TableId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +/// A sequence number from an ingester +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct SequenceNumber(u64); + +#[allow(missing_docs)] +impl SequenceNumber { + pub fn new(v: u64) -> Self { + Self(v) + } + pub fn get(&self) -> u64 { + self.0 + } +} + +impl Add for SequenceNumber { + type Output = Self; + + fn add(self, 
other: u64) -> Self { + Self(self.0 + other) + } +} + +impl Sub for SequenceNumber { + type Output = Self; + + fn sub(self, other: u64) -> Self { + Self(self.0 - other) + } +} + +/// A time in nanoseconds from epoch. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub struct Timestamp(i64); + +#[allow(missing_docs)] +impl Timestamp { + pub fn new(v: i64) -> Self { + Self(v) + } + + pub fn get(&self) -> i64 { + self.0 + } +} + +impl From for Timestamp { + fn from(time: iox_time::Time) -> Self { + Self::new(time.timestamp_nanos()) + } +} + +impl From for iox_time::Time { + fn from(time: Timestamp) -> iox_time::Time { + iox_time::Time::from_timestamp_nanos(time.get()) + } +} + +impl Add for Timestamp { + type Output = Self; + + fn add(self, other: Self) -> Self { + Self(self.0.checked_add(other.0).expect("timestamp wraparound")) + } +} + +impl Sub for Timestamp { + type Output = Self; + + fn sub(self, other: Self) -> Self { + Self(self.0.checked_sub(other.0).expect("timestamp wraparound")) + } +} + +impl Add for Timestamp { + type Output = Self; + + fn add(self, rhs: i64) -> Self::Output { + self + Self(rhs) + } +} + +impl Sub for Timestamp { + type Output = Self; + + fn sub(self, rhs: i64) -> Self::Output { + self - Self(rhs) + } +} + +/// Unique ID for a `ParquetFile` +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub struct ParquetFileId(i64); + +#[allow(missing_docs)] +impl ParquetFileId { + pub fn new(v: i64) -> Self { + Self(v) + } + pub fn get(&self) -> i64 { + self.0 + } +} + +impl std::fmt::Display for ParquetFileId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Use `self.number` to refer to each positional data point. + write!(f, "{}", self.0) + } +} + +/// Unique store UUID for a [`ParquetFile`]. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)]
+#[sqlx(transparent)]
+pub struct ObjectStoreId(Uuid);
+
+#[allow(missing_docs)]
+impl ObjectStoreId {
+    #[allow(clippy::new_without_default)]
+    pub fn new() -> Self {
+        Self::from_uuid(Uuid::new_v4())
+    }
+
+    pub fn from_uuid(uuid: Uuid) -> Self {
+        Self(uuid)
+    }
+
+    pub fn get_uuid(&self) -> Uuid {
+        self.0
+    }
+}
+
+impl std::fmt::Display for ObjectStoreId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl std::str::FromStr for ObjectStoreId {
+    type Err = uuid::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let uuid = Uuid::parse_str(s)?;
+        Ok(Self::from_uuid(uuid))
+    }
+}
+
+/// Data object for a namespace
+#[derive(Debug, Clone, PartialEq, sqlx::FromRow)]
+pub struct Namespace {
+    /// The id of the namespace
+    pub id: NamespaceId,
+    /// The unique name of the namespace
+    pub name: String,
+    /// The retention period in ns. None represents infinite duration (i.e. never drop data).
+    pub retention_period_ns: Option<i64>,
+    /// The maximum number of tables that can exist in this namespace
+    pub max_tables: MaxTables,
+    /// The maximum number of columns per table in this namespace
+    pub max_columns_per_table: MaxColumnsPerTable,
+    /// When this namespace was marked for deletion.
+    pub deleted_at: Option<Timestamp>,
+    /// The partition template to use for new tables in this namespace either created implicitly or
+    /// created without specifying a partition template.
+    pub partition_template: NamespacePartitionTemplateOverride,
+}
+
+/// Schema collection for a namespace. This is an in-memory object useful for a schema
+/// cache.
+#[derive(Debug, Clone, PartialEq, Hash)]
+pub struct NamespaceSchema {
+    /// the namespace id
+    pub id: NamespaceId,
+    /// the tables in the namespace by name
+    pub tables: BTreeMap<String, TableSchema>,
+    /// The maximum number of tables permitted in this namespace.
+    pub max_tables: MaxTables,
+    /// the number of columns per table this namespace allows
+    pub max_columns_per_table: MaxColumnsPerTable,
+    /// The retention period in ns.
+    /// None represents infinite duration (i.e. never drop data).
+    pub retention_period_ns: Option<i64>,
+    /// The partition template to use for new tables in this namespace either created implicitly or
+    /// created without specifying a partition template.
+    pub partition_template: NamespacePartitionTemplateOverride,
+}
+
+impl NamespaceSchema {
+    /// Start a new `NamespaceSchema` with empty `tables` but the rest of the information populated
+    /// from the given `Namespace`.
+    pub fn new_empty_from(namespace: &Namespace) -> Self {
+        let &Namespace {
+            id,
+            retention_period_ns,
+            max_tables,
+            max_columns_per_table,
+            ref partition_template,
+            ..
+        } = namespace;
+
+        Self {
+            id,
+            tables: BTreeMap::new(),
+            max_tables,
+            max_columns_per_table,
+            retention_period_ns,
+            partition_template: partition_template.clone(),
+        }
+    }
+
+    /// Estimated Size in bytes including `self`.
+    pub fn size(&self) -> usize {
+        std::mem::size_of_val(self)
+            + self
+                .tables
+                .iter()
+                .map(|(k, v)| size_of_val(k) + k.capacity() + v.size())
+                .sum::<usize>()
+    }
+}
+
+impl From<&NamespaceSchema> for generated_types::influxdata::iox::schema::v1::NamespaceSchema {
+    fn from(schema: &NamespaceSchema) -> Self {
+        namespace_schema_proto(schema.id, schema.tables.iter())
+    }
+}
+
+/// Generate [`NamespaceSchema`] protobuf from a `NamespaceId` and a list of tables. Useful to
+/// filter the tables returned from an API request to a particular table without needing to clone
+/// the whole `NamespaceSchema` to use the `From` impl.
+pub fn namespace_schema_proto<'a>( + id: NamespaceId, + tables: impl Iterator, +) -> generated_types::influxdata::iox::schema::v1::NamespaceSchema { + use generated_types::influxdata::iox::schema::v1 as proto; + proto::NamespaceSchema { + id: id.get(), + tables: tables + .map(|(name, t)| (name.clone(), proto::TableSchema::from(t))) + .collect(), + } +} + +/// Data object for a table +#[derive(Debug, Clone, sqlx::FromRow, PartialEq)] +pub struct Table { + /// The id of the table + pub id: TableId, + /// The namespace id that the table is in + pub namespace_id: NamespaceId, + /// The name of the table, which is unique within the associated namespace + pub name: String, + /// The partition template to use for writes in this table. + pub partition_template: TablePartitionTemplateOverride, +} + +/// Serialise a [`Table`] object into its protobuf representation. +impl From

for generated_types::influxdata::iox::table::v1::Table { + fn from(value: Table) -> Self { + generated_types::influxdata::iox::table::v1::Table { + id: value.id.get(), + name: value.name, + namespace_id: value.namespace_id.get(), + partition_template: value.partition_template.as_proto().cloned(), + } + } +} + +/// Column definitions for a table +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct TableSchema { + /// the table id + pub id: TableId, + + /// The partition template to use for writes in this table. + pub partition_template: TablePartitionTemplateOverride, + + /// the table's columns by their name + pub columns: ColumnsByName, +} + +impl TableSchema { + /// Initialize new `TableSchema` from the information in the given `Table`. + pub fn new_empty_from(table: &Table) -> Self { + Self { + id: table.id, + partition_template: table.partition_template.clone(), + columns: ColumnsByName::default(), + } + } + + /// Add `col` to this table schema. + /// + /// # Panics + /// + /// This method panics if a column of the same name already exists in + /// `self`, or if `col` references a different `table_id`. + pub fn add_column(&mut self, col: Column) { + let Column { + id, + name, + column_type, + table_id, + } = col; + + assert_eq!(table_id, self.id); + + let column_schema = ColumnSchema { id, column_type }; + self.add_column_schema(name, column_schema); + } + + /// Add the name and column schema to this table's schema. + /// + /// # Panics + /// + /// This method panics if a column of the same name already exists in + /// `self`. + pub fn add_column_schema( + &mut self, + column_name: impl Into>, + column_schema: ColumnSchema, + ) { + self.columns.add_column(column_name, column_schema); + } + + /// Estimated Size in bytes including `self`. + pub fn size(&self) -> usize { + size_of_val(self) + + self + .columns + .iter() + .map(|(k, v)| size_of_val(k) + k.as_ref().len() + size_of_val(v)) + .sum::() + } + + /// Create `ID->name` map for columns. 
+ pub fn column_id_map(&self) -> HashMap> { + self.columns.id_map() + } + + /// Whether a column with this name is in the schema. + pub fn contains_column_name(&self, name: &str) -> bool { + self.columns.contains_column_name(name) + } + + /// Return the set of column names for this table. Used in combination with a write operation's + /// column names to determine whether a write would exceed the max allowed columns. + pub fn column_names(&self) -> BTreeSet<&str> { + self.columns.names() + } + + /// Return number of columns of the table + pub fn column_count(&self) -> usize { + self.columns.column_count() + } +} + +impl From<&TableSchema> for generated_types::influxdata::iox::schema::v1::TableSchema { + fn from(table_schema: &TableSchema) -> Self { + use generated_types::influxdata::iox::schema::v1 as proto; + + Self { + id: table_schema.id.get(), + columns: table_schema + .columns + .iter() + .map(|(name, c)| { + ( + name.to_string(), + proto::ColumnSchema { + id: c.id.get(), + column_type: c.column_type as i32, + }, + ) + }) + .collect(), + } + } +} + +/// Data recorded when compaction skips a partition. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::FromRow)] +pub struct SkippedCompaction { + /// the partition + pub partition_id: PartitionId, + /// the reason compaction was skipped + pub reason: String, + /// when compaction was skipped + pub skipped_at: Timestamp, + /// estimated memory budget + pub estimated_bytes: i64, + /// limit on memory budget + pub limit_bytes: i64, + /// num files selected to compact + pub num_files: i64, + /// limit on num files + pub limit_num_files: i64, + /// limit on num files for the first file in a partition + pub limit_num_files_first_in_partition: i64, +} + +impl From + for generated_types::influxdata::iox::skipped_compaction::v1::SkippedCompaction +{ + fn from(skipped_compaction: SkippedCompaction) -> Self { + let SkippedCompaction { + partition_id, + reason, + skipped_at, + estimated_bytes, + limit_bytes, + num_files, + limit_num_files, + limit_num_files_first_in_partition, + } = skipped_compaction; + + Self { + partition_id: partition_id.get(), + reason, + skipped_at: skipped_at.get(), + estimated_bytes, + limit_bytes, + num_files, + limit_num_files, + limit_num_files_first_in_partition, + } + } +} + +impl From + for SkippedCompaction +{ + fn from( + skipped_compaction: generated_types::influxdata::iox::skipped_compaction::v1::SkippedCompaction, + ) -> Self { + Self { + partition_id: PartitionId::new(skipped_compaction.partition_id), + reason: skipped_compaction.reason, + skipped_at: Timestamp::new(skipped_compaction.skipped_at), + estimated_bytes: skipped_compaction.estimated_bytes, + limit_bytes: skipped_compaction.limit_bytes, + num_files: skipped_compaction.num_files, + limit_num_files: skipped_compaction.limit_num_files, + limit_num_files_first_in_partition: skipped_compaction + .limit_num_files_first_in_partition, + } + } +} + +/// Data for a parquet file reference that has been inserted in the catalog. 
+#[derive(Debug, Clone, PartialEq, Eq, sqlx::FromRow)] +pub struct ParquetFile { + /// the id of the file in the catalog + pub id: ParquetFileId, + /// the namespace + pub namespace_id: NamespaceId, + /// the table + pub table_id: TableId, + /// the partition identifier + pub partition_id: PartitionId, + /// the optional partition hash id + pub partition_hash_id: Option, + /// the uuid used in the object store path for this file + pub object_store_id: ObjectStoreId, + /// the min timestamp of data in this file + pub min_time: Timestamp, + /// the max timestamp of data in this file + pub max_time: Timestamp, + /// When this file was marked for deletion + pub to_delete: Option, + /// file size in bytes + pub file_size_bytes: i64, + /// the number of rows of data in this file + pub row_count: i64, + /// The compaction level of the file. + /// + /// * 0 (`CompactionLevel::Initial`): represents a level-0 file that is persisted by an + /// Ingester. Partitions with level-0 files are usually hot/recent partitions. + /// * 1 (`CompactionLevel::FileOverlapped`): represents a level-1 file that is persisted by a + /// Compactor and potentially overlaps with other level-1 files. Partitions with level-1 + /// files are partitions with a lot of or/and large overlapped files that have to go + /// through many compaction cycles before they are fully compacted to non-overlapped + /// files. + /// * 2 (`CompactionLevel::FileNonOverlapped`): represents a level-1 file that is persisted by + /// a Compactor and does not overlap with other files except level 0 ones. Eventually, + /// cold partitions (partitions that no longer needs to get compacted) will only include + /// one or many level-1 files + pub compaction_level: CompactionLevel, + /// the creation time of the parquet file + pub created_at: Timestamp, + /// Set of columns within this parquet file. + /// + /// # Relation to Table-wide Column Set + /// Columns within this set may or may not be part of the table-wide schema. 
+ /// + /// Columns that are NOT part of the table-wide schema must be ignored. It is likely that these + /// columns were originally part of the table but were later removed. + /// + /// # Column Types + /// Column types are identical to the table-wide types. + /// + /// # Column Order & Sort Key + /// The columns that are present in the table-wide schema are sorted according to the partition + /// sort key. The occur in the parquet file according to this order. + pub column_set: ColumnSet, + /// the max of created_at of all L0 files needed for file/chunk ordering for deduplication + pub max_l0_created_at: Timestamp, +} + +impl ParquetFile { + /// Create new file from given parameters and ID. + /// + /// [`to_delete`](Self::to_delete) will be set to `None`. + pub fn from_params(params: ParquetFileParams, id: ParquetFileId) -> Self { + Self { + id, + partition_id: params.partition_id, + partition_hash_id: params.partition_hash_id, + namespace_id: params.namespace_id, + table_id: params.table_id, + object_store_id: params.object_store_id, + min_time: params.min_time, + max_time: params.max_time, + to_delete: None, + file_size_bytes: params.file_size_bytes, + row_count: params.row_count, + compaction_level: params.compaction_level, + created_at: params.created_at, + column_set: params.column_set, + max_l0_created_at: params.max_l0_created_at, + } + } + + /// Estimate the memory consumption of this object and its contents + pub fn size(&self) -> usize { + let hash_id = self + .partition_hash_id + .as_ref() + .map(|x| x.size()) + .unwrap_or_default(); + + std::mem::size_of_val(self) + hash_id + self.column_set.size() + - std::mem::size_of_val(&self.column_set) + } + + /// Return true if the time range overlaps with the time range of the given file + pub fn overlaps(&self, other: &Self) -> bool { + self.min_time <= other.max_time && self.max_time >= other.min_time + } + + /// Return true if the time range of this file overlaps with the given time range + pub fn 
overlaps_time_range(&self, min_time: Timestamp, max_time: Timestamp) -> bool { + self.min_time <= max_time && self.max_time >= min_time + } + + /// Return true if the time range of this file overlaps with any of the given split times. + pub fn needs_split(&self, split_times: &Vec) -> bool { + for t in split_times { + // split time is the last timestamp on the "left" side of the split, if it equals + // the min time, one ns goes left, the rest goes right. + if self.min_time.get() <= *t && self.max_time.get() > *t { + return true; + } + } + false + } + + /// Return true if the time range of this file overlaps with any of the given file ranges + pub fn overlaps_ranges(&self, ranges: &Vec) -> bool { + for range in ranges { + if self.min_time.get() <= range.max && self.max_time.get() >= range.min { + return true; + } + } + false + } + + /// Temporary to aid incremental migration + pub fn transition_partition_id(&self) -> TransitionPartitionId { + TransitionPartitionId::from_parts(self.partition_id, self.partition_hash_id.clone()) + } +} + +impl From for generated_types::influxdata::iox::catalog::v1::ParquetFile { + fn from(v: ParquetFile) -> Self { + Self { + id: v.id.get(), + namespace_id: v.namespace_id.get(), + table_id: v.table_id.get(), + partition_id: v.partition_id.get(), + partition_hash_id: v + .partition_hash_id + .map(|x| x.as_bytes().to_vec()) + .unwrap_or_default(), + object_store_id: v.object_store_id.to_string(), + min_time: v.min_time.get(), + max_time: v.max_time.get(), + to_delete: v.to_delete.map(|v| v.get()), + file_size_bytes: v.file_size_bytes, + row_count: v.row_count, + compaction_level: v.compaction_level as i32, + created_at: v.created_at.get(), + column_set: v.column_set.iter().map(|v| v.get()).collect(), + max_l0_created_at: v.max_l0_created_at.get(), + } + } +} + +/// Errors deserialising a protobuf serialised [`ParquetFile`]. +#[derive(Debug, Error)] +pub enum ParquetFileProtoError { + /// The proto type does not contain a partition ID. 
+ #[error("no partition id specified for parquet file")] + NoPartitionId, + + /// The specified partition ID is invalid. + #[error(transparent)] + InvalidPartitionId(#[from] PartitionIdProtoError), + + /// The specified object store UUID is invalid. + #[error("invalid object store ID: {0}")] + InvalidObjectStoreId(uuid::Error), + + /// The specified compaction level value is invalid. + #[error(transparent)] + InvalidCompactionLevel(#[from] CompactionLevelProtoError), +} + +/// Data for a parquet file to be inserted into the catalog. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ParquetFileParams { + /// the namespace + pub namespace_id: NamespaceId, + /// the table + pub table_id: TableId, + /// the partition identifier + pub partition_id: PartitionId, + /// the partition hash ID + pub partition_hash_id: Option, + /// the uuid used in the object store path for this file + pub object_store_id: ObjectStoreId, + /// the min timestamp of data in this file + pub min_time: Timestamp, + /// the max timestamp of data in this file + pub max_time: Timestamp, + /// file size in bytes + pub file_size_bytes: i64, + /// the number of rows of data in this file + pub row_count: i64, + /// the compaction level of the file + pub compaction_level: CompactionLevel, + /// the creation time of the parquet file + pub created_at: Timestamp, + /// columns in this file. + pub column_set: ColumnSet, + /// the max of created_at of all L0 files + pub max_l0_created_at: Timestamp, +} + +/// ID of a chunk. +/// +/// This ID is unique within a single partition. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ChunkId(Uuid); + +impl ChunkId { + /// Create new, random ID. + #[allow(clippy::new_without_default)] // `new` creates non-deterministic result + pub fn new() -> Self { + Self(Uuid::new_v4()) + } + + /// **TESTING ONLY:** Create new ID from integer. 
+ /// + /// Since this can easily lead to ID collisions (which in turn can lead to panics), this must + /// only be used for testing purposes! + pub fn new_test(id: u128) -> Self { + Self(Uuid::from_u128(id)) + } + + /// The chunk id is only effective in case the chunk's order is the same with another chunk. + /// Hence collisions are safe in that context. + pub fn new_id(id: u128) -> Self { + Self(Uuid::from_u128(id)) + } + + /// Get inner UUID. + pub fn get(&self) -> Uuid { + self.0 + } +} + +impl std::fmt::Debug for ChunkId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(self, f) + } +} + +impl std::fmt::Display for ChunkId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if (self.0.get_variant() == uuid::Variant::RFC4122) + && (self.0.get_version() == Some(uuid::Version::Random)) + { + f.debug_tuple("ChunkId").field(&self.0).finish() + } else { + f.debug_tuple("ChunkId").field(&self.0.as_u128()).finish() + } + } +} + +impl From for ChunkId { + fn from(id: ObjectStoreId) -> Self { + Self(id.get_uuid()) + } +} + +/// Order of a chunk. +/// +/// This is used for: +/// 1. **upsert order:** chunks with higher order overwrite data in chunks with lower order +/// 2. **locking order:** chunks must be locked in consistent (ascending) order +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ChunkOrder(i64); + +impl ChunkOrder { + /// The minimum ordering value a chunk could have. Currently only used in testing. + pub const MIN: Self = Self(0); + + /// The maximum chunk order. + pub const MAX: Self = Self(i64::MAX); + + /// Create a ChunkOrder from the given value. + pub fn new(order: i64) -> Self { + Self(order) + } + + /// Under underlying order as integer. + pub fn get(&self) -> i64 { + self.0 + } +} + +/// Represents a parsed delete predicate for evaluation by the InfluxDB IOx +/// query engine. 
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DeletePredicate {
    /// Only rows within this range are included in
    /// results. Other rows are excluded.
    pub range: TimestampRange,

    /// Optional arbitrary predicates, represented as list of
    /// expressions applied a logical conjunction (aka they
    /// are 'AND'ed together). Only rows that evaluate to TRUE for all
    /// these expressions should be returned. Other rows are excluded
    /// from the results.
    pub exprs: Vec<DeleteExpr>,
}

impl DeletePredicate {
    /// Format expr to SQL string.
    pub fn expr_sql_string(&self) -> String {
        let mut out = String::new();
        for expr in &self.exprs {
            if !out.is_empty() {
                write!(&mut out, " AND ").expect("writing to a string shouldn't fail");
            }
            write!(&mut out, "{expr}").expect("writing to a string shouldn't fail");
        }
        out
    }

    /// Return the approximate memory size of the predicate, in bytes.
    ///
    /// This includes `Self`.
    pub fn size(&self) -> usize {
        std::mem::size_of::<Self>() + self.exprs.iter().map(|expr| expr.size()).sum::<usize>()
    }

    /// Return the delete predicate for data outside retention.
    /// We need to only retain time >= retention_time.
    /// Thus we only need to set the range to MIN < time < retention_time.
    pub fn retention_delete_predicate(retention_time: i64) -> Self {
        let range = TimestampRange {
            start: i64::MIN,
            end: retention_time,
        };
        Self {
            range,
            exprs: vec![],
        }
    }
}

/// Single expression to be used as parts of a predicate.
///
/// Only very simple expression of the type `<column> <op> <scalar>` are supported.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DeleteExpr {
    /// Column (w/o table name).
    pub column: String,

    /// Operator.
    pub op: Op,

    /// Scalar value.
    pub scalar: Scalar,
}

impl DeleteExpr {
    /// Create a new [`DeleteExpr`]
    pub fn new(column: String, op: Op, scalar: Scalar) -> Self {
        Self { column, op, scalar }
    }

    /// Column (w/o table name).
    pub fn column(&self) -> &str {
        &self.column
    }

    /// Operator.
    pub fn op(&self) -> Op {
        self.op
    }

    /// Scalar value.
    pub fn scalar(&self) -> &Scalar {
        &self.scalar
    }

    /// Return the approximate memory size of the expression, in bytes.
    ///
    /// This includes `Self`.
    pub fn size(&self) -> usize {
        std::mem::size_of::<Self>() + self.column.capacity() + self.scalar.size()
    }
}

impl std::fmt::Display for DeleteExpr {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Backslashes and double quotes inside the column name are escaped so
        // the output is a valid quoted SQL identifier.
        write!(
            f,
            r#""{}"{}{}"#,
            self.column().replace('\\', r"\\").replace('"', r#"\""#),
            self.op(),
            self.scalar(),
        )
    }
}

/// Binary operator that can be evaluated on a column and a scalar value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Op {
    /// Strict equality (`=`).
    Eq,

    /// Inequality (`!=`).
    Ne,
}

impl std::fmt::Display for Op {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Eq => write!(f, "="),
            Self::Ne => write!(f, "!="),
        }
    }
}

/// Scalar value of a certain type.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[allow(missing_docs)]
pub enum Scalar {
    Bool(bool),
    I64(i64),
    F64(ordered_float::OrderedFloat<f64>),
    String(String),
}

impl Scalar {
    /// Return the approximate memory size of the scalar, in bytes.
    ///
    /// This includes `Self`.
+ pub fn size(&self) -> usize { + std::mem::size_of::() + + match &self { + Self::Bool(_) | Self::I64(_) | Self::F64(_) => 0, + Self::String(s) => s.capacity(), + } + } +} + +impl std::fmt::Display for Scalar { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Scalar::Bool(value) => value.fmt(f), + Scalar::I64(value) => value.fmt(f), + Scalar::F64(value) => match value.classify() { + FpCategory::Nan => write!(f, "'NaN'"), + FpCategory::Infinite if *value.as_ref() < 0.0 => write!(f, "'-Infinity'"), + FpCategory::Infinite => write!(f, "'Infinity'"), + _ => write!(f, "{:?}", value.as_ref()), + }, + Scalar::String(value) => { + write!(f, "'{}'", value.replace('\\', r"\\").replace('\'', r"\'")) + } + } + } +} + +/// A string that cannot be empty +/// +/// This is particularly useful for types that map to/from protobuf, where string fields +/// are not nullable - that is they default to an empty string if not specified +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct NonEmptyString(Box); + +impl NonEmptyString { + /// Create a new `NonEmptyString` from the provided `String` + /// + /// Returns None if empty + pub fn new(s: impl Into) -> Option { + let s = s.into(); + match s.is_empty() { + true => None, + false => Some(Self(s.into_boxed_str())), + } + } +} + +impl Deref for NonEmptyString { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +/// Column name, statistics which encode type information +#[derive(Debug, PartialEq, Clone)] +pub struct ColumnSummary { + /// Column name + pub name: String, + + /// Column's Influx data model type + pub influxdb_type: InfluxDbType, + + /// Per column + pub stats: Statistics, +} + +impl ColumnSummary { + /// Returns the total number of rows (including nulls) in this column + pub fn total_count(&self) -> u64 { + self.stats.total_count() + } + + /// Updates statistics from other if the same type, otherwise a noop + pub fn 
update_from(&mut self, other: &Self) { + match (&mut self.stats, &other.stats) { + (Statistics::F64(s), Statistics::F64(o)) => { + s.update_from(o); + } + (Statistics::I64(s), Statistics::I64(o)) => { + s.update_from(o); + } + (Statistics::Bool(s), Statistics::Bool(o)) => { + s.update_from(o); + } + (Statistics::String(s), Statistics::String(o)) => { + s.update_from(o); + } + (Statistics::U64(s), Statistics::U64(o)) => { + s.update_from(o); + } + // do catch alls for the specific types, that way if a new type gets added, the compiler + // will complain. + (Statistics::F64(_), _) => unreachable!(), + (Statistics::I64(_), _) => unreachable!(), + (Statistics::U64(_), _) => unreachable!(), + (Statistics::Bool(_), _) => unreachable!(), + (Statistics::String(_), _) => unreachable!(), + } + } + + /// Updates these statistics so that that the total length of this + /// column is `len` rows, padding it with trailing NULLs if + /// necessary + pub fn update_to_total_count(&mut self, len: u64) { + let total_count = self.total_count(); + assert!( + total_count <= len, + "trying to shrink column stats from {total_count} to {len}" + ); + let delta = len - total_count; + self.stats.update_for_nulls(delta); + } + + /// Return size in bytes of this Column metadata (not the underlying column) + pub fn size(&self) -> usize { + mem::size_of::() + self.name.len() + self.stats.size() + } +} + +// Replicate this enum here as it can't be derived from the existing statistics +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[allow(missing_docs)] +pub enum InfluxDbType { + Tag, + Field, + Timestamp, +} + +/// Summary statistics for a column. 
+#[derive(Debug, Clone, Eq, PartialEq)] +pub struct StatValues { + /// minimum (non-NaN, non-NULL) value, if any + pub min: Option, + + /// maximum (non-NaN, non-NULL) value, if any + pub max: Option, + + /// total number of values in this column, including null values + pub total_count: u64, + + /// number of null values in this column + pub null_count: Option, + + /// number of distinct values in this column if known + /// + /// This includes NULLs and NANs + pub distinct_count: Option, +} + +/// Represents the result of comparing the min/max ranges of two [`StatValues`] +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum StatOverlap { + /// There is at least one value that exists in both ranges + NonZero, + + /// There are zero values that exists in both ranges + Zero, + + /// It is not known if there are any intersections (e.g. because + /// one of the bounds is not Known / is None) + Unknown, +} + +impl StatValues +where + T: PartialOrd, +{ + /// returns information about the overlap between two `StatValues` + pub fn overlaps(&self, other: &Self) -> StatOverlap { + match (&self.min, &self.max, &other.min, &other.max) { + (Some(self_min), Some(self_max), Some(other_min), Some(other_max)) => { + if self_min <= other_max && self_max >= other_min { + StatOverlap::NonZero + } else { + StatOverlap::Zero + } + } + // At least one of the values was None + _ => StatOverlap::Unknown, + } + } +} + +impl Default for StatValues { + fn default() -> Self { + Self { + min: None, + max: None, + total_count: 0, + null_count: None, + distinct_count: None, + } + } +} + +impl StatValues { + /// Create new statistics with no values + pub fn new_empty() -> Self { + Self { + min: None, + max: None, + total_count: 0, + null_count: Some(0), + distinct_count: None, + } + } + + /// Returns true if both the min and max values are None (aka not known) + pub fn is_none(&self) -> bool { + self.min.is_none() && self.max.is_none() + } + + /// Update the statistics values to account for 
`num_nulls` additional null values + pub fn update_for_nulls(&mut self, num_nulls: u64) { + self.total_count += num_nulls; + self.null_count = self.null_count.map(|x| x + num_nulls); + } + + /// updates the statistics keeping the min, max and incrementing count. + /// + /// The type plumbing exists to allow calling with `&str` on a `StatValues`. + pub fn update(&mut self, other: &U) + where + T: Borrow, + U: ToOwned + PartialOrd + IsNan, + { + self.total_count += 1; + self.distinct_count = None; + + if !other.is_nan() { + match &self.min { + None => self.min = Some(other.to_owned()), + Some(s) => { + if s.borrow() > other { + self.min = Some(other.to_owned()); + } + } + } + + match &self.max { + None => { + self.max = Some(other.to_owned()); + } + Some(s) => { + if other > s.borrow() { + self.max = Some(other.to_owned()); + } + } + } + } + } +} + +impl StatValues +where + T: Clone + PartialOrd, +{ + /// Updates statistics from other + pub fn update_from(&mut self, other: &Self) { + self.total_count += other.total_count; + self.null_count = self.null_count.zip(other.null_count).map(|(a, b)| a + b); + + // No way to accurately aggregate counts + self.distinct_count = None; + + match (&self.min, &other.min) { + (None, None) | (Some(_), None) => {} + (None, Some(o)) => self.min = Some(o.clone()), + (Some(s), Some(o)) => { + if s > o { + self.min = Some(o.clone()); + } + } + } + + match (&self.max, &other.max) { + (None, None) | (Some(_), None) => {} + (None, Some(o)) => self.max = Some(o.clone()), + (Some(s), Some(o)) => { + if o > s { + self.max = Some(o.clone()); + } + } + }; + } +} + +impl StatValues +where + T: IsNan + PartialOrd, +{ + /// Create new statistics with the specified count and null count + pub fn new(min: Option, max: Option, total_count: u64, null_count: Option) -> Self { + let distinct_count = None; + Self::new_with_distinct(min, max, total_count, null_count, distinct_count) + } + + /// Create statistics for a column that only has nulls up to now + 
pub fn new_all_null(total_count: u64, distinct_count: Option) -> Self { + let min = None; + let max = None; + let null_count = Some(total_count); + + if let Some(count) = distinct_count { + assert!(count > 0); + } + Self::new_with_distinct( + min, + max, + total_count, + null_count, + distinct_count.map(|c| NonZeroU64::new(c).unwrap()), + ) + } + + /// Create statistics for a column with zero nulls and unknown distinct count + pub fn new_non_null(min: Option, max: Option, total_count: u64) -> Self { + let null_count = Some(0); + let distinct_count = None; + Self::new_with_distinct(min, max, total_count, null_count, distinct_count) + } + + /// Create new statistics with the specified count and null count and distinct values + pub fn new_with_distinct( + min: Option, + max: Option, + total_count: u64, + null_count: Option, + distinct_count: Option, + ) -> Self { + if let Some(min) = &min { + assert!(!min.is_nan()); + } + if let Some(max) = &max { + assert!(!max.is_nan()); + } + if let (Some(min), Some(max)) = (&min, &max) { + assert!(min <= max); + } + + Self { + min, + max, + total_count, + null_count, + distinct_count, + } + } +} + +/// Whether a type is NaN or not. +pub trait IsNan { + /// Test for NaNess. + fn is_nan(&self) -> bool; +} + +impl IsNan for &T { + fn is_nan(&self) -> bool { + (*self).is_nan() + } +} + +macro_rules! impl_is_nan_false { + ($t:ty) => { + impl IsNan for $t { + fn is_nan(&self) -> bool { + false + } + } + }; +} + +impl_is_nan_false!(bool); +impl_is_nan_false!(str); +impl_is_nan_false!(String); +impl_is_nan_false!(i8); +impl_is_nan_false!(i16); +impl_is_nan_false!(i32); +impl_is_nan_false!(i64); +impl_is_nan_false!(u8); +impl_is_nan_false!(u16); +impl_is_nan_false!(u32); +impl_is_nan_false!(u64); + +impl IsNan for f64 { + fn is_nan(&self) -> bool { + Self::is_nan(*self) + } +} + +/// Statistics and type information for a column. 
#[derive(Debug, PartialEq, Clone)]
#[allow(missing_docs)]
pub enum Statistics {
    I64(StatValues<i64>),
    U64(StatValues<u64>),
    Bool(StatValues<bool>),
    String(StatValues<String>),

    /// For the purposes of min/max values of floats, NaN values are ignored (no
    /// ordering is applied to NaNs).
    F64(StatValues<f64>),
}

impl Statistics {
    /// Returns the total number of rows in this column
    pub fn total_count(&self) -> u64 {
        match self {
            Self::I64(s) => s.total_count,
            Self::U64(s) => s.total_count,
            Self::F64(s) => s.total_count,
            Self::Bool(s) => s.total_count,
            Self::String(s) => s.total_count,
        }
    }

    /// Returns true if both the min and max values are None (aka not known)
    pub fn is_none(&self) -> bool {
        match self {
            Self::I64(v) => v.is_none(),
            Self::U64(v) => v.is_none(),
            Self::F64(v) => v.is_none(),
            Self::Bool(v) => v.is_none(),
            Self::String(v) => v.is_none(),
        }
    }

    /// Returns the number of null rows in this column, if known
    pub fn null_count(&self) -> Option<u64> {
        match self {
            Self::I64(s) => s.null_count,
            Self::U64(s) => s.null_count,
            Self::F64(s) => s.null_count,
            Self::Bool(s) => s.null_count,
            Self::String(s) => s.null_count,
        }
    }

    /// Returns the distinct count if known
    pub fn distinct_count(&self) -> Option<NonZeroU64> {
        match self {
            Self::I64(s) => s.distinct_count,
            Self::U64(s) => s.distinct_count,
            Self::F64(s) => s.distinct_count,
            Self::Bool(s) => s.distinct_count,
            Self::String(s) => s.distinct_count,
        }
    }

    /// Update the statistics values to account for `num_nulls` additional null values
    pub fn update_for_nulls(&mut self, num_nulls: u64) {
        match self {
            Self::I64(v) => v.update_for_nulls(num_nulls),
            Self::U64(v) => v.update_for_nulls(num_nulls),
            Self::F64(v) => v.update_for_nulls(num_nulls),
            Self::Bool(v) => v.update_for_nulls(num_nulls),
            Self::String(v) => v.update_for_nulls(num_nulls),
        }
    }

    /// Return the size in bytes of this stats instance
    pub fn size(&self) -> usize {
        match self {
Self::String(v) => std::mem::size_of::() + v.string_size(), + _ => std::mem::size_of::(), + } + } + + /// Return a human interpretable description of this type + pub fn type_name(&self) -> &'static str { + match self { + Self::I64(_) => "I64", + Self::U64(_) => "U64", + Self::F64(_) => "F64", + Self::Bool(_) => "Bool", + Self::String(_) => "String", + } + } + + /// Extract i64 type. + pub fn as_i64(&self) -> Option<&StatValues> { + match self { + Self::I64(val) => Some(val), + _ => None, + } + } +} + +impl StatValues { + /// Returns the bytes associated by storing min/max string values + pub fn string_size(&self) -> usize { + self.min.as_ref().map(|x| x.len()).unwrap_or(0) + + self.max.as_ref().map(|x| x.len()).unwrap_or(0) + } +} + +/// Metadata and statistics information for a table. This can be +/// either for the portion of a Table stored within a single chunk or +/// aggregated across chunks. +#[derive(Debug, PartialEq, Clone, Default)] +pub struct TableSummary { + /// Per column statistics + pub columns: Vec, +} + +impl TableSummary { + /// Get the column summary by name. + pub fn column(&self, name: &str) -> Option<&ColumnSummary> { + self.columns.iter().find(|c| c.name == name) + } + + /// Returns the total number of rows in the columns of this summary + pub fn total_count(&self) -> u64 { + // Assumes that all tables have the same number of rows, so + // pick the first one + let count = self.columns.first().map(|c| c.total_count()).unwrap_or(0); + + // Validate that the counts are consistent across columns + for c in &self.columns { + // Restore to assert when https://github.com/influxdata/influxdb_iox/issues/2124 is fixed + if c.total_count() != count { + warn!(column_name=%c.name, + column_count=c.total_count(), previous_count=count, + "Mismatch in statistics count, see #2124"); + } + } + count + } + + /// Updates the table summary with combined stats from the other. Counts are + /// treated as non-overlapping so they're just added together. 
If the + /// type of a column differs between the two tables, no update is done + /// on that column. Columns that only exist in the other are cloned into + /// this table summary. + pub fn update_from(&mut self, other: &Self) { + let new_total_count = self.total_count() + other.total_count(); + + // update all existing columns + for col in &mut self.columns { + if let Some(other_col) = other.column(&col.name) { + col.update_from(other_col); + } else { + col.update_to_total_count(new_total_count); + } + } + + // Add any columns that were new + for col in &other.columns { + if self.column(&col.name).is_none() { + let mut new_col = col.clone(); + // ensure the count is consistent + new_col.update_to_total_count(new_total_count); + self.columns.push(new_col); + } + } + } + + /// Total size of all ColumnSummaries that belong to this table which include + /// column names and their stats + pub fn size(&self) -> usize { + let size: usize = self.columns.iter().map(|c| c.size()).sum(); + size + mem::size_of::() // Add size of this struct that points to + // table and ColumnSummary + } + + /// Extracts min/max values of the timestamp column, if possible + pub fn time_range(&self) -> Option { + self.column(TIME_COLUMN_NAME).and_then(|c| { + if let Statistics::I64(StatValues { + min: Some(min), + max: Some(max), + .. + }) = &c.stats + { + Some(TimestampMinMax::new(*min, *max)) + } else { + None + } + }) + } +} + +/// minimum time that can be represented. +/// +/// 1677-09-21 00:12:43.145224194 +0000 UTC +/// +/// The two lowest minimum integers are used as sentinel values. The +/// minimum value needs to be used as a value lower than any other value for +/// comparisons and another separate value is needed to act as a sentinel +/// default value that is unusable by the user, but usable internally. +/// Because these two values need to be used for a special purpose, we do +/// not allow users to write points at these two times. 
///
/// Source: [influxdb](https://github.com/influxdata/influxdb/blob/540bb66e1381a48a6d1ede4fc3e49c75a7d9f4af/models/time.go#L12-L34)
pub const MIN_NANO_TIME: i64 = i64::MIN + 2;

/// maximum time that can be represented.
///
/// 2262-04-11 23:47:16.854775806 +0000 UTC
///
/// The highest time represented by a nanosecond needs to be used for an exclusive range, so the
/// maximum time needs to be one less than the possible maximum number of nanoseconds representable
/// by an int64 so that we don't lose a point at that one time.
/// Source: [influxdb](https://github.com/influxdata/influxdb/blob/540bb66e1381a48a6d1ede4fc3e49c75a7d9f4af/models/time.go#L12-L34)
pub const MAX_NANO_TIME: i64 = i64::MAX - 1;

/// Specifies a continuous range of nanosecond timestamps. Timestamp
/// predicates are so common and critical to performance of timeseries
/// databases in general, and IOx in particular, that they are handled
/// specially
///
/// Timestamp ranges are defined such that a value `v` is within the
/// range iff:
///
/// ```text
/// range.start <= v < range.end
/// ```
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Copy, Debug, Hash)]
pub struct TimestampRange {
    /// Start defines the inclusive lower bound. Minimum value is [MIN_NANO_TIME]
    start: i64,
    /// End defines the exclusive upper bound. Maximum value is [MAX_NANO_TIME]
    end: i64,
}

impl TimestampRange {
    /// Create a new TimestampRange.
    ///
    /// Takes an inclusive start and an exclusive end. You may create an empty range by setting
    /// `start = end`.
    ///
    /// Clamps `start` to [`MIN_NANO_TIME`].
    /// end is unclamped. End may be set to `i64::MAX == MAX_NANO_TIME+1` to indicate that the
    /// upper bound is NOT restricted (this does NOT affect `start` though!).
    ///
    /// If `start > end`, this will be interpreted as an empty time range and `start` will be set
    /// to `end`.
+ pub fn new(start: i64, end: i64) -> Self { + let start = start.clamp(MIN_NANO_TIME, end); + let end = end.max(MIN_NANO_TIME); + Self { start, end } + } + + /// Returns true if this range contains all representable timestamps + pub fn contains_all(&self) -> bool { + self.start <= MIN_NANO_TIME && self.end > MAX_NANO_TIME + } + + /// Returns true if this range contains all representable timestamps except possibly MAX_NANO_TIME + /// + /// This is required for queries from InfluxQL, which are intended to be + /// for all time but instead can be for [MIN_NANO_TIME, MAX_NANO_TIME). + /// When is fixed, + /// all uses of contains_nearly_all should be replaced by contains_all + pub fn contains_nearly_all(&self) -> bool { + self.start <= MIN_NANO_TIME && self.end >= MAX_NANO_TIME + } + + #[inline] + /// Returns true if this range contains the value v + pub fn contains(&self, v: i64) -> bool { + self.start <= v && v < self.end + } + + /// Return the timestamp exclusive range's end. + pub fn end(&self) -> i64 { + self.end + } + + /// Return the timestamp inclusive range's start. + pub fn start(&self) -> i64 { + self.start + } +} + +/// Specifies a min/max timestamp value. +/// +/// Note this differs subtlety (but critically) from a +/// [`TimestampRange`] as the minimum and maximum values are included ([`TimestampRange`] has an exclusive end). +#[derive(Clone, Debug, Copy, PartialEq, Eq)] +pub struct TimestampMinMax { + /// The minimum timestamp value + pub min: i64, + /// the maximum timestamp value + pub max: i64, +} + +impl TimestampMinMax { + /// Create a new TimestampMinMax. Panics if min > max. 
+ pub fn new(min: i64, max: i64) -> Self { + assert!(min <= max, "expected min ({min}) <= max ({max})"); + Self { min, max } + } + + #[inline] + /// Returns true if any of the values between min / max + /// (inclusive) are contained within the specified timestamp range + pub fn overlaps(&self, range: TimestampRange) -> bool { + range.contains(self.min) + || range.contains(self.max) + || (self.min <= range.start && self.max >= range.end) + } + + /// Returns the union of this range with `other` with the minimum of the `min`s + /// and the maximum of the `max`es + + pub fn union(&self, other: &Self) -> Self { + Self { + min: self.min.min(other.min), + max: self.max.max(other.max), + } + } +} + +/// FileRange describes a range of files by the min/max time and the sum of their capacities. +#[derive(Clone, Debug, Copy, PartialEq, Eq)] +pub struct FileRange { + /// The minimum time of any file in the range + pub min: i64, + /// The maximum time of any file in the range + pub max: i64, + /// The sum of the sizes of all files in the range + pub cap: usize, +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::borrow::Cow; + + use ordered_float::OrderedFloat; + + #[test] + fn test_chunk_id_new() { + // `ChunkId::new()` create new random ID + assert_ne!(ChunkId::new(), ChunkId::new()); + } + + #[test] + fn test_chunk_id_new_test() { + // `ChunkId::new_test(...)` creates deterministic ID + assert_eq!(ChunkId::new_test(1), ChunkId::new_test(1)); + assert_ne!(ChunkId::new_test(1), ChunkId::new_test(2)); + } + + #[test] + fn test_chunk_id_debug_and_display() { + // Random chunk IDs use UUID-format + let id_random = ChunkId::new(); + let inner: Uuid = id_random.get(); + assert_eq!(format!("{id_random:?}"), format!("ChunkId({inner})")); + assert_eq!(format!("{id_random}"), format!("ChunkId({inner})")); + + // Deterministic IDs use integer format + let id_test = ChunkId::new_test(42); + assert_eq!(format!("{id_test:?}"), "ChunkId(42)"); + assert_eq!(format!("{id_test}"), 
"ChunkId(42)"); + } + + #[test] + fn test_expr_to_sql_no_expressions() { + let pred = DeletePredicate { + range: TimestampRange::new(1, 2), + exprs: vec![], + }; + assert_eq!(&pred.expr_sql_string(), ""); + } + + #[test] + fn test_expr_to_sql_operators() { + let pred = DeletePredicate { + range: TimestampRange::new(1, 2), + exprs: vec![ + DeleteExpr { + column: String::from("col1"), + op: Op::Eq, + scalar: Scalar::I64(1), + }, + DeleteExpr { + column: String::from("col2"), + op: Op::Ne, + scalar: Scalar::I64(2), + }, + ], + }; + assert_eq!(&pred.expr_sql_string(), r#""col1"=1 AND "col2"!=2"#); + } + + #[test] + fn test_expr_to_sql_column_escape() { + let pred = DeletePredicate { + range: TimestampRange::new(1, 2), + exprs: vec![ + DeleteExpr { + column: String::from("col 1"), + op: Op::Eq, + scalar: Scalar::I64(1), + }, + DeleteExpr { + column: String::from(r"col\2"), + op: Op::Eq, + scalar: Scalar::I64(2), + }, + DeleteExpr { + column: String::from(r#"col"3"#), + op: Op::Eq, + scalar: Scalar::I64(3), + }, + ], + }; + assert_eq!( + &pred.expr_sql_string(), + r#""col 1"=1 AND "col\\2"=2 AND "col\"3"=3"# + ); + } + + #[test] + fn test_expr_to_sql_bool() { + let pred = DeletePredicate { + range: TimestampRange::new(1, 2), + exprs: vec![ + DeleteExpr { + column: String::from("col1"), + op: Op::Eq, + scalar: Scalar::Bool(false), + }, + DeleteExpr { + column: String::from("col2"), + op: Op::Eq, + scalar: Scalar::Bool(true), + }, + ], + }; + assert_eq!(&pred.expr_sql_string(), r#""col1"=false AND "col2"=true"#); + } + + #[test] + fn test_expr_to_sql_i64() { + let pred = DeletePredicate { + range: TimestampRange::new(1, 2), + exprs: vec![ + DeleteExpr { + column: String::from("col1"), + op: Op::Eq, + scalar: Scalar::I64(0), + }, + DeleteExpr { + column: String::from("col2"), + op: Op::Eq, + scalar: Scalar::I64(-1), + }, + DeleteExpr { + column: String::from("col3"), + op: Op::Eq, + scalar: Scalar::I64(1), + }, + DeleteExpr { + column: String::from("col4"), + op: Op::Eq, + 
scalar: Scalar::I64(i64::MIN), + }, + DeleteExpr { + column: String::from("col5"), + op: Op::Eq, + scalar: Scalar::I64(i64::MAX), + }, + ], + }; + assert_eq!( + &pred.expr_sql_string(), + r#""col1"=0 AND "col2"=-1 AND "col3"=1 AND "col4"=-9223372036854775808 AND "col5"=9223372036854775807"# + ); + } + + #[test] + fn test_expr_to_sql_f64() { + let pred = DeletePredicate { + range: TimestampRange::new(1, 2), + exprs: vec![ + DeleteExpr { + column: String::from("col1"), + op: Op::Eq, + scalar: Scalar::F64(OrderedFloat::from(0.0)), + }, + DeleteExpr { + column: String::from("col2"), + op: Op::Eq, + scalar: Scalar::F64(OrderedFloat::from(-0.0)), + }, + DeleteExpr { + column: String::from("col3"), + op: Op::Eq, + scalar: Scalar::F64(OrderedFloat::from(1.0)), + }, + DeleteExpr { + column: String::from("col4"), + op: Op::Eq, + scalar: Scalar::F64(OrderedFloat::from(f64::INFINITY)), + }, + DeleteExpr { + column: String::from("col5"), + op: Op::Eq, + scalar: Scalar::F64(OrderedFloat::from(f64::NEG_INFINITY)), + }, + DeleteExpr { + column: String::from("col6"), + op: Op::Eq, + scalar: Scalar::F64(OrderedFloat::from(f64::NAN)), + }, + ], + }; + assert_eq!( + &pred.expr_sql_string(), + r#""col1"=0.0 AND "col2"=-0.0 AND "col3"=1.0 AND "col4"='Infinity' AND "col5"='-Infinity' AND "col6"='NaN'"# + ); + } + + #[test] + fn test_expr_to_sql_string() { + let pred = DeletePredicate { + range: TimestampRange::new(1, 2), + exprs: vec![ + DeleteExpr { + column: String::from("col1"), + op: Op::Eq, + scalar: Scalar::String(String::from("")), + }, + DeleteExpr { + column: String::from("col2"), + op: Op::Eq, + scalar: Scalar::String(String::from("foo")), + }, + DeleteExpr { + column: String::from("col3"), + op: Op::Eq, + scalar: Scalar::String(String::from(r"fo\o")), + }, + DeleteExpr { + column: String::from("col4"), + op: Op::Eq, + scalar: Scalar::String(String::from(r#"fo'o"#)), + }, + ], + }; + assert_eq!( + &pred.expr_sql_string(), + r#""col1"='' AND "col2"='foo' AND "col3"='fo\\o' AND 
"col4"='fo\'o'"# + ); + } + + #[test] + fn statistics_new_non_null() { + let actual = StatValues::new_non_null(Some(-1i64), Some(1i64), 3); + let expected = StatValues { + min: Some(-1i64), + max: Some(1i64), + total_count: 3, + null_count: Some(0), + distinct_count: None, + }; + assert_eq!(actual, expected); + } + + #[test] + fn statistics_new_all_null() { + // i64 values do not have a distinct count + let actual = StatValues::::new_all_null(3, None); + let expected = StatValues { + min: None, + max: None, + total_count: 3, + null_count: Some(3), + distinct_count: None, + }; + assert_eq!(actual, expected); + + // string columns can have a distinct count + let actual = StatValues::::new_all_null(3, Some(1_u64)); + let expected = StatValues { + min: None, + max: None, + total_count: 3, + null_count: Some(3), + distinct_count: Some(NonZeroU64::try_from(1_u64).unwrap()), + }; + assert_eq!(actual, expected); + } + + impl StatValues + where + T: IsNan + PartialOrd + Clone, + { + fn new_with_value(starting_value: T) -> Self { + let starting_value = if starting_value.is_nan() { + None + } else { + Some(starting_value) + }; + + let min = starting_value.clone(); + let max = starting_value; + let total_count = 1; + let null_count = Some(0); + let distinct_count = None; + Self::new_with_distinct(min, max, total_count, null_count, distinct_count) + } + } + + impl Statistics { + /// Return the minimum value, if any, formatted as a string + fn min_as_str(&self) -> Option> { + match self { + Self::I64(v) => v.min.map(|x| Cow::Owned(x.to_string())), + Self::U64(v) => v.min.map(|x| Cow::Owned(x.to_string())), + Self::F64(v) => v.min.map(|x| Cow::Owned(x.to_string())), + Self::Bool(v) => v.min.map(|x| Cow::Owned(x.to_string())), + Self::String(v) => v.min.as_deref().map(Cow::Borrowed), + } + } + + /// Return the maximum value, if any, formatted as a string + fn max_as_str(&self) -> Option> { + match self { + Self::I64(v) => v.max.map(|x| Cow::Owned(x.to_string())), + Self::U64(v) => 
v.max.map(|x| Cow::Owned(x.to_string())), + Self::F64(v) => v.max.map(|x| Cow::Owned(x.to_string())), + Self::Bool(v) => v.max.map(|x| Cow::Owned(x.to_string())), + Self::String(v) => v.max.as_deref().map(Cow::Borrowed), + } + } + } + + #[test] + fn statistics_update() { + let mut stat = StatValues::new_with_value(23); + assert_eq!(stat.min, Some(23)); + assert_eq!(stat.max, Some(23)); + assert_eq!(stat.total_count, 1); + + stat.update(&55); + assert_eq!(stat.min, Some(23)); + assert_eq!(stat.max, Some(55)); + assert_eq!(stat.total_count, 2); + + stat.update(&6); + assert_eq!(stat.min, Some(6)); + assert_eq!(stat.max, Some(55)); + assert_eq!(stat.total_count, 3); + + stat.update(&30); + assert_eq!(stat.min, Some(6)); + assert_eq!(stat.max, Some(55)); + assert_eq!(stat.total_count, 4); + } + + #[test] + fn statistics_default() { + let mut stat = StatValues::default(); + assert_eq!(stat.min, None); + assert_eq!(stat.max, None); + assert_eq!(stat.total_count, 0); + + stat.update(&55); + assert_eq!(stat.min, Some(55)); + assert_eq!(stat.max, Some(55)); + assert_eq!(stat.total_count, 1); + + let mut stat = StatValues::::default(); + assert_eq!(stat.min, None); + assert_eq!(stat.max, None); + assert_eq!(stat.total_count, 0); + + stat.update("cupcakes"); + assert_eq!(stat.min, Some("cupcakes".to_string())); + assert_eq!(stat.max, Some("cupcakes".to_string())); + assert_eq!(stat.total_count, 1); + + stat.update("woo"); + assert_eq!(stat.min, Some("cupcakes".to_string())); + assert_eq!(stat.max, Some("woo".to_string())); + assert_eq!(stat.total_count, 2); + } + + #[test] + fn statistics_is_none() { + let mut stat = StatValues::default(); + assert!(stat.is_none()); + stat.min = Some(0); + assert!(!stat.is_none()); + stat.max = Some(1); + assert!(!stat.is_none()); + } + + #[test] + fn statistics_overlaps() { + let stat1 = StatValues { + min: Some(10), + max: Some(20), + ..Default::default() + }; + assert_eq!(stat1.overlaps(&stat1), StatOverlap::NonZero); + + // [--stat1--] + 
// [--stat2--] + let stat2 = StatValues { + min: Some(5), + max: Some(15), + ..Default::default() + }; + assert_eq!(stat1.overlaps(&stat2), StatOverlap::NonZero); + assert_eq!(stat2.overlaps(&stat1), StatOverlap::NonZero); + + // [--stat1--] + // [--stat3--] + let stat3 = StatValues { + min: Some(15), + max: Some(25), + ..Default::default() + }; + assert_eq!(stat1.overlaps(&stat3), StatOverlap::NonZero); + assert_eq!(stat3.overlaps(&stat1), StatOverlap::NonZero); + + // [--stat1--] + // [--stat4--] + let stat4 = StatValues { + min: Some(25), + max: Some(35), + ..Default::default() + }; + assert_eq!(stat1.overlaps(&stat4), StatOverlap::Zero); + assert_eq!(stat4.overlaps(&stat1), StatOverlap::Zero); + + // [--stat1--] + // [--stat5--] + let stat5 = StatValues { + min: Some(0), + max: Some(5), + ..Default::default() + }; + assert_eq!(stat1.overlaps(&stat5), StatOverlap::Zero); + assert_eq!(stat5.overlaps(&stat1), StatOverlap::Zero); + } + + #[test] + fn statistics_overlaps_none() { + let stat1 = StatValues { + min: Some(10), + max: Some(20), + ..Default::default() + }; + + let stat2 = StatValues { + min: None, + max: Some(20), + ..Default::default() + }; + assert_eq!(stat1.overlaps(&stat2), StatOverlap::Unknown); + assert_eq!(stat2.overlaps(&stat1), StatOverlap::Unknown); + + let stat3 = StatValues { + min: Some(10), + max: None, + ..Default::default() + }; + assert_eq!(stat1.overlaps(&stat3), StatOverlap::Unknown); + assert_eq!(stat3.overlaps(&stat1), StatOverlap::Unknown); + + let stat4 = StatValues { + min: None, + max: None, + ..Default::default() + }; + assert_eq!(stat1.overlaps(&stat4), StatOverlap::Unknown); + assert_eq!(stat4.overlaps(&stat1), StatOverlap::Unknown); + } + + #[test] + fn statistics_overlaps_mixed_none() { + let stat1 = StatValues { + min: Some(10), + max: None, + ..Default::default() + }; + + let stat2 = StatValues { + min: None, + max: Some(5), + ..Default::default() + }; + assert_eq!(stat1.overlaps(&stat2), StatOverlap::Unknown); + 
assert_eq!(stat2.overlaps(&stat1), StatOverlap::Unknown); + } + + #[test] + fn update_string() { + let mut stat = StatValues::new_with_value("bbb".to_string()); + assert_eq!(stat.min, Some("bbb".to_string())); + assert_eq!(stat.max, Some("bbb".to_string())); + assert_eq!(stat.total_count, 1); + + stat.update("aaa"); + assert_eq!(stat.min, Some("aaa".to_string())); + assert_eq!(stat.max, Some("bbb".to_string())); + assert_eq!(stat.total_count, 2); + + stat.update("z"); + assert_eq!(stat.min, Some("aaa".to_string())); + assert_eq!(stat.max, Some("z".to_string())); + assert_eq!(stat.total_count, 3); + + stat.update("p"); + assert_eq!(stat.min, Some("aaa".to_string())); + assert_eq!(stat.max, Some("z".to_string())); + assert_eq!(stat.total_count, 4); + } + + #[test] + fn stats_is_none() { + let stat = Statistics::I64(StatValues::new_non_null(Some(-1), Some(100), 1)); + assert!(!stat.is_none()); + + let stat = Statistics::I64(StatValues::new_non_null(None, Some(100), 1)); + assert!(!stat.is_none()); + + let stat = Statistics::I64(StatValues::new_non_null(None, None, 0)); + assert!(stat.is_none()); + } + + #[test] + fn stats_as_str_i64() { + let stat = Statistics::I64(StatValues::new_non_null(Some(-1), Some(100), 1)); + assert_eq!(stat.min_as_str(), Some("-1".into())); + assert_eq!(stat.max_as_str(), Some("100".into())); + + let stat = Statistics::I64(StatValues::new_non_null(None, None, 1)); + assert_eq!(stat.min_as_str(), None); + assert_eq!(stat.max_as_str(), None); + } + + #[test] + fn stats_as_str_u64() { + let stat = Statistics::U64(StatValues::new_non_null(Some(1), Some(100), 1)); + assert_eq!(stat.min_as_str(), Some("1".into())); + assert_eq!(stat.max_as_str(), Some("100".into())); + + let stat = Statistics::U64(StatValues::new_non_null(None, None, 1)); + assert_eq!(stat.min_as_str(), None); + assert_eq!(stat.max_as_str(), None); + } + + #[test] + fn stats_as_str_f64() { + let stat = Statistics::F64(StatValues::new_non_null(Some(99.0), Some(101.0), 1)); + 
assert_eq!(stat.min_as_str(), Some("99".into())); + assert_eq!(stat.max_as_str(), Some("101".into())); + + let stat = Statistics::F64(StatValues::new_non_null(None, None, 1)); + assert_eq!(stat.min_as_str(), None); + assert_eq!(stat.max_as_str(), None); + } + + #[test] + fn stats_as_str_bool() { + let stat = Statistics::Bool(StatValues::new_non_null(Some(false), Some(true), 1)); + assert_eq!(stat.min_as_str(), Some("false".into())); + assert_eq!(stat.max_as_str(), Some("true".into())); + + let stat = Statistics::Bool(StatValues::new_non_null(None, None, 1)); + assert_eq!(stat.min_as_str(), None); + assert_eq!(stat.max_as_str(), None); + } + + #[test] + fn stats_as_str_str() { + let stat = Statistics::String(StatValues::new_non_null( + Some("a".to_string()), + Some("zz".to_string()), + 1, + )); + assert_eq!(stat.min_as_str(), Some("a".into())); + assert_eq!(stat.max_as_str(), Some("zz".into())); + + let stat = Statistics::String(StatValues::new_non_null(None, None, 1)); + assert_eq!(stat.min_as_str(), None); + assert_eq!(stat.max_as_str(), None); + } + + #[test] + fn table_update_from() { + let mut string_stats = StatValues::new_with_value("foo".to_string()); + string_stats.update("bar"); + let string_col = ColumnSummary { + name: "string".to_string(), + influxdb_type: InfluxDbType::Field, + stats: Statistics::String(string_stats), + }; + + let mut int_stats = StatValues::new_with_value(1); + int_stats.update(&5); + let int_col = ColumnSummary { + name: "int".to_string(), + influxdb_type: InfluxDbType::Field, + stats: Statistics::I64(int_stats), + }; + + let mut float_stats = StatValues::new_with_value(9.1); + float_stats.update(&1.3); + let float_col = ColumnSummary { + name: "float".to_string(), + influxdb_type: InfluxDbType::Field, + stats: Statistics::F64(float_stats), + }; + + let mut table_a = TableSummary { + columns: vec![string_col, int_col, float_col], + }; + + let mut string_stats = StatValues::new_with_value("aaa".to_string()); + 
string_stats.update("zzz"); + let string_col = ColumnSummary { + name: "string".to_string(), + influxdb_type: InfluxDbType::Field, + stats: Statistics::String(string_stats), + }; + + let mut int_stats = StatValues::new_with_value(3); + int_stats.update(&9); + let int_col = ColumnSummary { + name: "int".to_string(), + influxdb_type: InfluxDbType::Field, + stats: Statistics::I64(int_stats), + }; + + let mut table_b = TableSummary { + columns: vec![int_col, string_col], + }; + + // keep this to test joining the other way + let table_c = table_a.clone(); + + table_a.update_from(&table_b); + let col = table_a.column("string").unwrap(); + assert_eq!( + col.stats, + Statistics::String(StatValues::new_non_null( + Some("aaa".to_string()), + Some("zzz".to_string()), + 4, + )) + ); + + let col = table_a.column("int").unwrap(); + assert_eq!( + col.stats, + Statistics::I64(StatValues::new_non_null(Some(1), Some(9), 4)) + ); + + let col = table_a.column("float").unwrap(); + assert_eq!( + col.stats, + Statistics::F64(StatValues::new(Some(1.3), Some(9.1), 4, Some(2))) + ); + + table_b.update_from(&table_c); + let col = table_b.column("string").unwrap(); + assert_eq!( + col.stats, + Statistics::String(StatValues::new_non_null( + Some("aaa".to_string()), + Some("zzz".to_string()), + 4, + )) + ); + + let col = table_b.column("int").unwrap(); + assert_eq!( + col.stats, + Statistics::I64(StatValues::new_non_null(Some(1), Some(9), 4)) + ); + + let col = table_b.column("float").unwrap(); + assert_eq!( + col.stats, + Statistics::F64(StatValues::new(Some(1.3), Some(9.1), 4, Some(2))) + ); + } + + #[test] + fn table_update_from_new_column() { + let string_stats = StatValues::new_with_value("bar".to_string()); + let string_col = ColumnSummary { + name: "string".to_string(), + influxdb_type: InfluxDbType::Tag, + stats: Statistics::String(string_stats), + }; + + let int_stats = StatValues::new_with_value(5); + let int_col = ColumnSummary { + name: "int".to_string(), + influxdb_type: 
InfluxDbType::Field, + stats: Statistics::I64(int_stats), + }; + + // table summary that does not have the "string" col + let table1 = TableSummary { + columns: vec![int_col.clone()], + }; + + // table summary that has both columns + let table2 = TableSummary { + columns: vec![int_col, string_col], + }; + + // Statistics should be the same regardless of the order we update the stats + + let expected_string_stats = Statistics::String(StatValues::new( + Some("bar".to_string()), + Some("bar".to_string()), + 2, // total count is 2 even though did not appear in the update + Some(1), // 1 null + )); + + let expected_int_stats = Statistics::I64(StatValues::new( + Some(5), + Some(5), + 2, + Some(0), // no nulls + )); + + // update table 1 with table 2 + let mut table = table1.clone(); + table.update_from(&table2); + + assert_eq!( + &table.column("string").unwrap().stats, + &expected_string_stats + ); + + assert_eq!(&table.column("int").unwrap().stats, &expected_int_stats); + + // update table 2 with table 1 + let mut table = table2; + table.update_from(&table1); + + assert_eq!( + &table.column("string").unwrap().stats, + &expected_string_stats + ); + + assert_eq!(&table.column("int").unwrap().stats, &expected_int_stats); + } + + #[test] + fn column_update_from_boolean() { + let bool_false = ColumnSummary { + name: "b".to_string(), + influxdb_type: InfluxDbType::Field, + stats: Statistics::Bool(StatValues::new(Some(false), Some(false), 1, Some(1))), + }; + let bool_true = ColumnSummary { + name: "b".to_string(), + influxdb_type: InfluxDbType::Field, + stats: Statistics::Bool(StatValues::new(Some(true), Some(true), 1, Some(2))), + }; + + let expected_stats = Statistics::Bool(StatValues::new(Some(false), Some(true), 2, Some(3))); + + let mut b = bool_false.clone(); + b.update_from(&bool_true); + assert_eq!(b.stats, expected_stats); + + let mut b = bool_true; + b.update_from(&bool_false); + assert_eq!(b.stats, expected_stats); + } + + #[test] + fn column_update_from_u64() { + 
let mut min = ColumnSummary { + name: "foo".to_string(), + influxdb_type: InfluxDbType::Field, + stats: Statistics::U64(StatValues::new(Some(5), Some(23), 1, Some(1))), + }; + + let max = ColumnSummary { + name: "foo".to_string(), + influxdb_type: InfluxDbType::Field, + stats: Statistics::U64(StatValues::new(Some(6), Some(506), 43, Some(2))), + }; + + min.update_from(&max); + + let expected = Statistics::U64(StatValues::new(Some(5), Some(506), 44, Some(3))); + assert_eq!(min.stats, expected); + } + + #[test] + fn nans() { + let mut stat = StatValues::default(); + assert_eq!(stat.min, None); + assert_eq!(stat.max, None); + assert_eq!(stat.total_count, 0); + + stat.update(&f64::NAN); + assert_eq!(stat.min, None); + assert_eq!(stat.max, None); + assert_eq!(stat.total_count, 1); + + stat.update(&1.0); + assert_eq!(stat.min, Some(1.0)); + assert_eq!(stat.max, Some(1.0)); + assert_eq!(stat.total_count, 2); + + stat.update(&2.0); + assert_eq!(stat.min, Some(1.0)); + assert_eq!(stat.max, Some(2.0)); + assert_eq!(stat.total_count, 3); + + stat.update(&f64::INFINITY); + assert_eq!(stat.min, Some(1.0)); + assert_eq!(stat.max, Some(f64::INFINITY)); + assert_eq!(stat.total_count, 4); + + stat.update(&-1.0); + assert_eq!(stat.min, Some(-1.0)); + assert_eq!(stat.max, Some(f64::INFINITY)); + assert_eq!(stat.total_count, 5); + + // =========== + + let mut stat = StatValues::new_with_value(2.0); + stat.update(&f64::INFINITY); + assert_eq!(stat.min, Some(2.0)); + assert_eq!(stat.max, Some(f64::INFINITY)); + assert_eq!(stat.total_count, 2); + + stat.update(&f64::NAN); + assert_eq!(stat.min, Some(2.0)); + assert_eq!(stat.max, Some(f64::INFINITY)); + assert_eq!(stat.total_count, 3); + + // =========== + + let mut stat2 = StatValues::new_with_value(1.0); + stat2.update_from(&stat); + assert_eq!(stat2.min, Some(1.0)); + assert_eq!(stat.max, Some(f64::INFINITY)); + assert_eq!(stat2.total_count, 4); + + // =========== + + let stat2 = StatValues::new_with_value(1.0); + 
stat.update_from(&stat2); + assert_eq!(stat.min, Some(1.0)); + assert_eq!(stat.max, Some(f64::INFINITY)); + assert_eq!(stat.total_count, 4); + + // =========== + + let stat = StatValues::new_with_value(f64::NAN); + assert_eq!(stat.min, None); + assert_eq!(stat.max, None); + assert_eq!(stat.total_count, 1); + } + + #[test] + fn test_timestamp_nano_min_max() { + let cases = vec![ + ( + "MIN / MAX Nanos", + TimestampRange::new(MIN_NANO_TIME, MAX_NANO_TIME + 1), + ), + ("MIN/MAX i64", TimestampRange::new(i64::MIN, i64::MAX)), + ]; + + for (name, range) in cases { + println!("case: {name}"); + assert!(!range.contains(i64::MIN)); + assert!(!range.contains(i64::MIN + 1)); + assert!(range.contains(MIN_NANO_TIME)); + assert!(range.contains(MIN_NANO_TIME + 1)); + assert!(range.contains(MAX_NANO_TIME - 1)); + assert!(range.contains(MAX_NANO_TIME)); + assert!(!range.contains(i64::MAX)); + assert!(range.contains_all()); + assert!(range.contains_nearly_all()); + } + } + + #[test] + fn test_timestamp_i64_min_max_offset() { + let range = TimestampRange::new(MIN_NANO_TIME + 1, MAX_NANO_TIME - 1); + + assert!(!range.contains(i64::MIN)); + assert!(!range.contains(MIN_NANO_TIME)); + assert!(range.contains(MIN_NANO_TIME + 1)); + assert!(range.contains(MAX_NANO_TIME - 2)); + assert!(!range.contains(MAX_NANO_TIME - 1)); + assert!(!range.contains(MAX_NANO_TIME)); + assert!(!range.contains(i64::MAX)); + assert!(!range.contains_all()); + assert!(!range.contains_nearly_all()); + } + + #[test] + fn test_timestamp_i64_min_max_offset_max() { + let range = TimestampRange::new(MIN_NANO_TIME, MAX_NANO_TIME); + + assert!(!range.contains(i64::MIN)); + assert!(range.contains(MIN_NANO_TIME)); + assert!(range.contains(MIN_NANO_TIME + 1)); + assert!(range.contains(MAX_NANO_TIME - 1)); + assert!(!range.contains(MAX_NANO_TIME)); + assert!(!range.contains(i64::MAX)); + assert!(!range.contains_all()); + assert!(range.contains_nearly_all()); + } + + #[test] + fn test_timestamp_range_contains() { + let range 
= TimestampRange::new(100, 200); + assert!(!range.contains(99)); + assert!(range.contains(100)); + assert!(range.contains(101)); + assert!(range.contains(199)); + assert!(!range.contains(200)); + assert!(!range.contains(201)); + } + + #[test] + fn test_timestamp_range_overlaps() { + let range = TimestampRange::new(100, 200); + assert!(!TimestampMinMax::new(0, 99).overlaps(range)); + assert!(TimestampMinMax::new(0, 100).overlaps(range)); + assert!(TimestampMinMax::new(0, 101).overlaps(range)); + + assert!(TimestampMinMax::new(0, 200).overlaps(range)); + assert!(TimestampMinMax::new(0, 201).overlaps(range)); + assert!(TimestampMinMax::new(0, 300).overlaps(range)); + + assert!(TimestampMinMax::new(100, 101).overlaps(range)); + assert!(TimestampMinMax::new(100, 200).overlaps(range)); + assert!(TimestampMinMax::new(100, 201).overlaps(range)); + + assert!(TimestampMinMax::new(101, 101).overlaps(range)); + assert!(TimestampMinMax::new(101, 200).overlaps(range)); + assert!(TimestampMinMax::new(101, 201).overlaps(range)); + + assert!(!TimestampMinMax::new(200, 200).overlaps(range)); + assert!(!TimestampMinMax::new(200, 201).overlaps(range)); + + assert!(!TimestampMinMax::new(201, 300).overlaps(range)); + } + + #[test] + #[should_panic(expected = "expected min (2) <= max (1)")] + fn test_timestamp_min_max_invalid() { + TimestampMinMax::new(2, 1); + } + + #[test] + fn test_table_schema_size() { + let schema1 = TableSchema { + id: TableId::new(1), + partition_template: Default::default(), + columns: ColumnsByName::default(), + }; + let schema2 = TableSchema { + id: TableId::new(2), + partition_template: Default::default(), + columns: ColumnsByName::new([Column { + id: ColumnId::new(1), + table_id: TableId::new(2), + name: String::from("foo"), + column_type: ColumnType::Bool, + }]), + }; + assert!(schema1.size() < schema2.size()); + } + + #[test] + fn test_namespace_schema_size() { + let schema1 = NamespaceSchema { + id: NamespaceId::new(1), + tables: BTreeMap::from([]), + 
max_tables: MaxTables::try_from(42).unwrap(), + max_columns_per_table: MaxColumnsPerTable::try_from(4).unwrap(), + retention_period_ns: None, + partition_template: Default::default(), + }; + let schema2 = NamespaceSchema { + id: NamespaceId::new(1), + tables: BTreeMap::from([( + String::from("foo"), + TableSchema { + id: TableId::new(1), + columns: ColumnsByName::default(), + partition_template: Default::default(), + }, + )]), + max_tables: MaxTables::try_from(42).unwrap(), + max_columns_per_table: MaxColumnsPerTable::try_from(4).unwrap(), + retention_period_ns: None, + partition_template: Default::default(), + }; + assert!(schema1.size() < schema2.size()); + } + + #[test] + #[should_panic = "timestamp wraparound"] + fn test_timestamp_wraparound_panic_add_i64() { + let _ = Timestamp::new(i64::MAX) + 1; + } + + #[test] + #[should_panic = "timestamp wraparound"] + fn test_timestamp_wraparound_panic_sub_i64() { + let _ = Timestamp::new(i64::MIN) - 1; + } + + #[test] + #[should_panic = "timestamp wraparound"] + fn test_timestamp_wraparound_panic_add_timestamp() { + let _ = Timestamp::new(i64::MAX) + Timestamp::new(1); + } + + #[test] + #[should_panic = "timestamp wraparound"] + fn test_timestamp_wraparound_panic_sub_timestamp() { + let _ = Timestamp::new(i64::MIN) - Timestamp::new(1); + } + + #[test] + fn test_timestamprange_start_after_end() { + let tr = TimestampRange::new(2, 1); + assert_eq!(tr.start(), 1); + assert_eq!(tr.end(), 1); + } +} diff --git a/data_types/src/namespace_name.rs b/data_types/src/namespace_name.rs new file mode 100644 index 0000000..e9e2e58 --- /dev/null +++ b/data_types/src/namespace_name.rs @@ -0,0 +1,350 @@ +use std::{borrow::Cow, ops::RangeInclusive}; + +use thiserror::Error; + +/// Length constraints for a [`NamespaceName`] name. +/// +/// A `RangeInclusive` is a closed interval, covering [1, 64] +const LENGTH_CONSTRAINT: RangeInclusive = 1..=64; + +/// Allowlist of chars for a [`NamespaceName`] name. 
+/// +/// '/' | '_' | '-' are utilized by the platforms. +fn is_allowed(c: char) -> bool { + c.is_alphanumeric() || matches!(c, '/' | '_' | '-') +} + +/// Errors returned when attempting to construct a [`NamespaceName`] from an org +/// & bucket string pair. +#[derive(Debug, Error)] +pub enum OrgBucketMappingError { + /// An error returned when the org, or bucket string contains invalid + /// characters. + #[error("invalid namespace name: {0}")] + InvalidNamespaceName(#[from] NamespaceNameError), + + /// Either the org, or bucket is an empty string. + #[error("missing org/bucket value")] + NoOrgBucketSpecified, +} + +/// [`NamespaceName`] name validation errors. +#[derive(Debug, Error)] +pub enum NamespaceNameError { + /// The provided namespace name does not fall within the valid length of a + /// namespace. + #[error( + "namespace name {} length must be between {} and {} characters", + name, + LENGTH_CONSTRAINT.start(), + LENGTH_CONSTRAINT.end() + )] + LengthConstraint { + /// The user-provided namespace that failed validation. + name: String, + }, + + /// The provided namespace name contains an unacceptable character. + #[error( + "namespace name '{}' contains invalid character, character number {} \ + is not whitelisted", + name, + bad_char_offset + )] + BadChars { + /// The zero-indexed (multi-byte) character position that failed + /// validation. + bad_char_offset: usize, + /// The user-provided namespace that failed validation. + name: String, + }, +} + +/// A correctly formed namespace name. +/// +/// Using this wrapper type allows the consuming code to enforce the invariant +/// that only valid names are provided. 
+/// +/// This type derefs to a `str` and therefore can be used in place of anything +/// that is expecting a `str`: +/// +/// ```rust +/// # use data_types::NamespaceName; +/// fn print_namespace(s: &str) { +/// println!("namespace name: {}", s); +/// } +/// +/// let ns = NamespaceName::new("data").unwrap(); +/// print_namespace(&ns); +/// ``` +/// +/// But this is not reciprocal - functions that wish to accept only +/// pre-validated names can use `NamespaceName` as a parameter. +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct NamespaceName<'a>(Cow<'a, str>); + +impl<'a> NamespaceName<'a> { + /// Create a new, valid NamespaceName. + pub fn new>>(name: T) -> Result { + let name: Cow<'a, str> = name.into(); + + if !LENGTH_CONSTRAINT.contains(&name.len()) { + return Err(NamespaceNameError::LengthConstraint { + name: name.to_string(), + }); + } + + // Validate the name contains only valid characters. + // + // NOTE: If changing these characters, please update the error message + // above. + if let Some(bad_char_offset) = name.chars().position(|c| !is_allowed(c)) { + return Err(NamespaceNameError::BadChars { + bad_char_offset, + name: name.to_string(), + }); + }; + + Ok(Self(name)) + } + + /// Borrow a string slice of the name. + pub fn as_str(&self) -> &str { + self.0.as_ref() + } + + /// Map an InfluxDB 2.X org & bucket into an IOx NamespaceName. + /// + /// This function ensures the mapping is unambiguous by encoding any + /// non-alphanumeric characters in both `org` and `bucket` in addition to + /// the validation performed in [`NamespaceName::new()`]. + pub fn from_org_and_bucket, B: AsRef>( + org: O, + bucket: B, + ) -> Result { + let org = org.as_ref(); + let bucket = bucket.as_ref(); + + if org.is_empty() || bucket.is_empty() { + return Err(OrgBucketMappingError::NoOrgBucketSpecified); + } + + Ok(Self::new(format!("{}_{}", org, bucket))?) + } + + /// Efficiently returns the string representation of this [`NamespaceName`]. 
+ /// + /// If this [`NamespaceName`] contains an owned string, it is returned + /// without cloning. + pub fn into_string(self) -> String { + self.0.into_owned() + } +} + +impl<'a> std::convert::From> for String { + fn from(name: NamespaceName<'a>) -> Self { + name.0.to_string() + } +} + +impl<'a> std::convert::From<&NamespaceName<'a>> for String { + fn from(name: &NamespaceName<'a>) -> Self { + name.to_string() + } +} + +impl<'a> std::convert::TryFrom<&'a str> for NamespaceName<'a> { + type Error = NamespaceNameError; + + fn try_from(v: &'a str) -> Result { + Self::new(v) + } +} + +impl<'a> std::convert::TryFrom for NamespaceName<'a> { + type Error = NamespaceNameError; + + fn try_from(v: String) -> Result { + Self::new(v) + } +} + +impl<'a> std::ops::Deref for NamespaceName<'a> { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + +impl<'a> AsRef<[u8]> for NamespaceName<'a> { + fn as_ref(&self) -> &[u8] { + self.as_str().as_bytes() + } +} + +impl<'a> std::fmt::Display for NamespaceName<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_org_bucket_map_db_ok() { + let got = NamespaceName::from_org_and_bucket("org", "bucket") + .expect("failed on valid DB mapping"); + + assert_eq!(got.as_str(), "org_bucket"); + assert_eq!(got.into_string(), "org_bucket"); + } + + #[test] + fn test_into_string() { + // Ref type str + assert_eq!( + NamespaceName::new("bananas").unwrap().into_string(), + "bananas" + ); + // Owned type string + assert_eq!( + NamespaceName::new("bananas".to_string()) + .unwrap() + .into_string(), + "bananas" + ); + } + + #[test] + fn test_org_bucket_map_db_contains_underscore() { + let got = NamespaceName::from_org_and_bucket("my_org", "bucket").unwrap(); + assert_eq!(got.as_str(), "my_org_bucket"); + + let got = NamespaceName::from_org_and_bucket("org", "my_bucket").unwrap(); + assert_eq!(got.as_str(), 
"org_my_bucket"); + + let got = NamespaceName::from_org_and_bucket("org", "my__bucket").unwrap(); + assert_eq!(got.as_str(), "org_my__bucket"); + + let got = NamespaceName::from_org_and_bucket("my_org", "my_bucket").unwrap(); + assert_eq!(got.as_str(), "my_org_my_bucket"); + } + + #[test] + fn test_org_bucket_map_db_contains_underscore_and_percent() { + let err = NamespaceName::from_org_and_bucket("my%5Forg", "bucket"); + assert!(matches!( + err, + Err(OrgBucketMappingError::InvalidNamespaceName { .. }) + )); + + let err = NamespaceName::from_org_and_bucket("my%5Forg_", "bucket"); + assert!(matches!( + err, + Err(OrgBucketMappingError::InvalidNamespaceName { .. }) + )); + } + + #[test] + fn test_bad_namespace_name_fails_validation() { + let err = NamespaceName::from_org_and_bucket("org", "bucket?"); + assert!(matches!( + err, + Err(OrgBucketMappingError::InvalidNamespaceName { .. }) + )); + + let err = NamespaceName::from_org_and_bucket("org!", "bucket"); + assert!(matches!( + err, + Err(OrgBucketMappingError::InvalidNamespaceName { .. 
}) + )); + } + + #[test] + fn test_empty_org_bucket() { + let err = NamespaceName::from_org_and_bucket("", "") + .expect_err("should fail with empty org/bucket valuese"); + assert!(matches!(err, OrgBucketMappingError::NoOrgBucketSpecified)); + } + + #[test] + fn test_deref() { + let db = NamespaceName::new("my_example_name").unwrap(); + assert_eq!(&*db, "my_example_name"); + } + + #[test] + fn test_too_short() { + let name = "".to_string(); + let got = NamespaceName::try_from(name).unwrap_err(); + + assert!(matches!( + got, + NamespaceNameError::LengthConstraint { name: _n } + )); + } + + #[test] + fn test_too_long() { + let name = "my_example_name_that_is_quite_a_bit_longer_than_allowed_even_though_database_names_can_be_quite_long_bananas".to_string(); + let got = NamespaceName::try_from(name).unwrap_err(); + + assert!(matches!( + got, + NamespaceNameError::LengthConstraint { name: _n } + )); + } + + #[test] + fn test_bad_chars_null() { + let got = NamespaceName::new("example\x00").unwrap_err(); + assert_eq!(got.to_string() , "namespace name 'example\x00' contains invalid character, character number 7 is not whitelisted"); + } + + #[test] + fn test_bad_chars_high_control() { + let got = NamespaceName::new("\u{007f}example").unwrap_err(); + assert_eq!(got.to_string() , "namespace name '\u{007f}example' contains invalid character, character number 0 is not whitelisted"); + } + + #[test] + fn test_bad_chars_tab() { + let got = NamespaceName::new("example\tdb").unwrap_err(); + assert_eq!(got.to_string() , "namespace name 'example\tdb' contains invalid character, character number 7 is not whitelisted"); + } + + #[test] + fn test_bad_chars_newline() { + let got = NamespaceName::new("my_example\ndb").unwrap_err(); + assert_eq!(got.to_string() , "namespace name 'my_example\ndb' contains invalid character, character number 10 is not whitelisted"); + } + + #[test] + fn test_bad_chars_whitespace() { + let got = NamespaceName::new("my_example db").unwrap_err(); + 
assert_eq!(got.to_string() , "namespace name 'my_example db' contains invalid character, character number 10 is not whitelisted"); + } + + #[test] + fn test_bad_chars_single_quote() { + let got = NamespaceName::new("my_example'db").unwrap_err(); + assert_eq!(got.to_string() , "namespace name 'my_example\'db' contains invalid character, character number 10 is not whitelisted"); + } + + #[test] + fn test_ok_chars() { + let db = + NamespaceName::new("my-example-db_with_underscores/and/fwd/slash/AndCaseSensitive") + .unwrap(); + assert_eq!( + &*db, + "my-example-db_with_underscores/and/fwd/slash/AndCaseSensitive" + ); + + let db = NamespaceName::new("a_ã_京").unwrap(); + assert_eq!(&*db, "a_ã_京"); + } +} diff --git a/data_types/src/partition.rs b/data_types/src/partition.rs new file mode 100644 index 0000000..eb09524 --- /dev/null +++ b/data_types/src/partition.rs @@ -0,0 +1,690 @@ +//! Types having to do with partitions. + +use super::{ColumnsByName, SortKeyIds, TableId, Timestamp}; + +use schema::sort::SortKey; +use sha2::Digest; +use std::{fmt::Display, sync::Arc}; +use thiserror::Error; + +/// Unique ID for a `Partition` during the transition from catalog-assigned sequential +/// `PartitionId`s to deterministic `PartitionHashId`s. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum TransitionPartitionId { + /// The old catalog-assigned sequential `PartitionId`s that are in the process of being + /// deprecated. + Deprecated(PartitionId), + /// The new deterministic, hash-based `PartitionHashId`s that will be the new way to identify + /// partitions. + Deterministic(PartitionHashId), +} + +impl TransitionPartitionId { + /// Create a [`TransitionPartitionId`] from a [`PartitionId`] and optional [`PartitionHashId`] + pub fn from_parts(id: PartitionId, hash_id: Option) -> Self { + match hash_id { + Some(x) => Self::Deterministic(x), + None => Self::Deprecated(id), + } + } + + /// Size in bytes including `self`. 
+ pub fn size(&self) -> usize { + match self { + Self::Deprecated(_) => std::mem::size_of::(), + Self::Deterministic(id) => { + std::mem::size_of::() + id.size() - std::mem::size_of_val(id) + } + } + } +} + +impl<'a, R> sqlx::FromRow<'a, R> for TransitionPartitionId +where + R: sqlx::Row, + &'static str: sqlx::ColumnIndex, + PartitionId: sqlx::decode::Decode<'a, R::Database>, + PartitionId: sqlx::types::Type, + Option: sqlx::decode::Decode<'a, R::Database>, + Option: sqlx::types::Type, +{ + fn from_row(row: &'a R) -> sqlx::Result { + let partition_id: Option = row.try_get("partition_id")?; + let partition_hash_id: Option = row.try_get("partition_hash_id")?; + + let transition_partition_id = match (partition_id, partition_hash_id) { + (_, Some(hash_id)) => TransitionPartitionId::Deterministic(hash_id), + (Some(id), _) => TransitionPartitionId::Deprecated(id), + (None, None) => { + return Err(sqlx::Error::ColumnDecode { + index: "partition_id".into(), + source: "Both partition_id and partition_hash_id were NULL".into(), + }) + } + }; + + Ok(transition_partition_id) + } +} + +impl From<(PartitionId, Option<&PartitionHashId>)> for TransitionPartitionId { + fn from((partition_id, partition_hash_id): (PartitionId, Option<&PartitionHashId>)) -> Self { + Self::from_parts(partition_id, partition_hash_id.cloned()) + } +} + +impl std::fmt::Display for TransitionPartitionId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Deprecated(old_partition_id) => write!(f, "{}", old_partition_id.0), + Self::Deterministic(partition_hash_id) => write!(f, "{}", partition_hash_id), + } + } +} + +impl TransitionPartitionId { + /// Create a new `TransitionPartitionId::Deterministic` with the given table + /// ID and partition key. Provided to reduce typing and duplication a bit, + /// and because this variant should be most common now. 
+ /// + /// This MUST NOT be used for partitions that are addressed using legacy / + /// deprecated catalog row IDs, which should use + /// [`TransitionPartitionId::Deprecated`] instead. + pub fn new(table_id: TableId, partition_key: &PartitionKey) -> Self { + Self::Deterministic(PartitionHashId::new(table_id, partition_key)) + } + + /// Create a new `TransitionPartitionId` for cases in tests where you need some value but the + /// value doesn't matter. Public and not test-only so that other crates' tests can use this. + pub fn arbitrary_for_testing() -> Self { + Self::new(TableId::new(0), &PartitionKey::from("arbitrary")) + } +} + +/// Errors deserialising protobuf representations of [`TransitionPartitionId`]. +#[derive(Debug, Error)] +pub enum PartitionIdProtoError { + /// The proto type does not contain an ID. + #[error("no id specified for partition id")] + NoId, + + /// The specified hash ID is invalid. + #[error(transparent)] + InvalidHashId(#[from] PartitionHashIdError), +} + +/// Serialise a [`TransitionPartitionId`] to a protobuf representation. +impl From + for generated_types::influxdata::iox::catalog::v1::PartitionIdentifier +{ + fn from(value: TransitionPartitionId) -> Self { + use generated_types::influxdata::iox::catalog::v1 as proto; + match value { + TransitionPartitionId::Deprecated(id) => proto::PartitionIdentifier { + id: Some(proto::partition_identifier::Id::CatalogId(id.get())), + }, + TransitionPartitionId::Deterministic(hash) => proto::PartitionIdentifier { + id: Some(proto::partition_identifier::Id::HashId( + hash.as_bytes().to_owned(), + )), + }, + } + } +} + +/// Deserialise a [`TransitionPartitionId`] from a protobuf representation. 
+impl TryFrom + for TransitionPartitionId +{ + type Error = PartitionIdProtoError; + + fn try_from( + value: generated_types::influxdata::iox::catalog::v1::PartitionIdentifier, + ) -> Result { + use generated_types::influxdata::iox::catalog::v1 as proto; + + let id = value.id.ok_or(PartitionIdProtoError::NoId)?; + + Ok(match id { + proto::partition_identifier::Id::CatalogId(v) => { + TransitionPartitionId::Deprecated(PartitionId::new(v)) + } + proto::partition_identifier::Id::HashId(hash) => { + TransitionPartitionId::Deterministic(PartitionHashId::try_from(hash.as_slice())?) + } + }) + } +} + +/// Unique ID for a `Partition` +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type, sqlx::FromRow)] +#[sqlx(transparent)] +pub struct PartitionId(i64); + +#[allow(missing_docs)] +impl PartitionId { + pub const fn new(v: i64) -> Self { + Self(v) + } + pub fn get(&self) -> i64 { + self.0 + } +} + +impl std::fmt::Display for PartitionId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +/// Defines a partition via an arbitrary string within a table within +/// a namespace. +/// +/// Implemented as a reference-counted string, serialisable to +/// the Postgres VARCHAR data type. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct PartitionKey(Arc); + +impl PartitionKey { + /// Returns true if this instance of [`PartitionKey`] is backed by the same + /// string storage as other. + pub fn ptr_eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.0, &other.0) + } + + /// Returns underlying string. + pub fn inner(&self) -> &str { + &self.0 + } + + /// Returns the bytes of the inner string. 
+ pub fn as_bytes(&self) -> &[u8] { + self.0.as_bytes() + } +} + +impl Display for PartitionKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(&self.0) + } +} + +impl From for PartitionKey { + fn from(s: String) -> Self { + assert!(!s.is_empty()); + Self(s.into()) + } +} + +impl From<&str> for PartitionKey { + fn from(s: &str) -> Self { + assert!(!s.is_empty()); + Self(s.into()) + } +} + +impl sqlx::Type for PartitionKey { + fn type_info() -> sqlx::postgres::PgTypeInfo { + // Store this type as VARCHAR + sqlx::postgres::PgTypeInfo::with_name("VARCHAR") + } +} + +impl sqlx::Encode<'_, sqlx::Postgres> for PartitionKey { + fn encode_by_ref( + &self, + buf: &mut >::ArgumentBuffer, + ) -> sqlx::encode::IsNull { + <&str as sqlx::Encode>::encode(&self.0, buf) + } +} + +impl sqlx::Decode<'_, sqlx::Postgres> for PartitionKey { + fn decode( + value: >::ValueRef, + ) -> Result> { + Ok(Self( + >::decode(value)?.into(), + )) + } +} + +impl sqlx::Type for PartitionKey { + fn type_info() -> sqlx::sqlite::SqliteTypeInfo { + >::type_info() + } +} + +impl sqlx::Encode<'_, sqlx::Sqlite> for PartitionKey { + fn encode_by_ref( + &self, + buf: &mut >::ArgumentBuffer, + ) -> sqlx::encode::IsNull { + >::encode(self.0.to_string(), buf) + } +} + +impl sqlx::Decode<'_, sqlx::Sqlite> for PartitionKey { + fn decode( + value: >::ValueRef, + ) -> Result> { + Ok(Self( + >::decode(value)?.into(), + )) + } +} + +const PARTITION_HASH_ID_SIZE_BYTES: usize = 32; + +/// Uniquely identify a partition based on its table ID and partition key. 
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, sqlx::FromRow)] +#[sqlx(transparent)] +pub struct PartitionHashId(Arc<[u8; PARTITION_HASH_ID_SIZE_BYTES]>); + +impl std::fmt::Display for PartitionHashId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for byte in &*self.0 { + write!(f, "{:02x}", byte)?; + } + Ok(()) + } +} + +impl std::hash::Hash for PartitionHashId { + #[inline(always)] + fn hash(&self, state: &mut H) { + // the slice is already hashed, so we can be a bit more efficient: + // A hash of an object is technically only 64bits (this is what `Hasher::finish()` will produce). We assume that + // the SHA256 hash sum that was used to create the partition hash is good enough so that every 64-bit slice of + // it is a good hash candidate for the entire object. Hence, we only forward the first 64 bits to the hasher and + // call it a day. + + // There is currently no nice way to slice fixed-sized arrays, see: + // https://github.com/rust-lang/rust/issues/90091 + // + // So we implement this the hard way (to avoid some nasty panic paths that are quite expensive within a hash function). + // Conversion borrowed from https://github.com/rust-lang/rfcs/issues/1833#issuecomment-269509262 + const N_BYTES: usize = u64::BITS as usize / 8; + #[allow(clippy::assertions_on_constants)] + const _: () = assert!(PARTITION_HASH_ID_SIZE_BYTES >= N_BYTES); + let ptr = self.0.as_ptr() as *const [u8; N_BYTES]; + let sub: &[u8; N_BYTES] = unsafe { &*ptr }; + + state.write_u64(u64::from_ne_bytes(*sub)); + } +} + +/// Reasons bytes specified aren't a valid `PartitionHashId`. 
+#[derive(Debug, Error)] +#[allow(missing_copy_implementations)] +pub enum PartitionHashIdError { + /// The bytes specified were not valid + #[error("Could not interpret bytes as `PartitionHashId`: {data:?}")] + InvalidBytes { + /// The bytes used in the attempt to create a `PartitionHashId` + data: Vec, + }, +} + +impl TryFrom<&[u8]> for PartitionHashId { + type Error = PartitionHashIdError; + + fn try_from(data: &[u8]) -> Result { + let data: [u8; PARTITION_HASH_ID_SIZE_BYTES] = + data.try_into() + .map_err(|_| PartitionHashIdError::InvalidBytes { + data: data.to_vec(), + })?; + + Ok(Self(Arc::new(data))) + } +} + +impl PartitionHashId { + /// Create a new `PartitionHashId`. + pub fn new(table_id: TableId, partition_key: &PartitionKey) -> Self { + Self::from_raw(table_id, partition_key.as_bytes()) + } + + /// Create a new `PartitionHashId` + pub fn from_raw(table_id: TableId, key: &[u8]) -> Self { + // The hash ID of a partition is the SHA-256 of the `TableId` then the `PartitionKey`. This + // particular hash format was chosen so that there won't be collisions and this value can + // be used to uniquely identify a Partition without needing to go to the catalog to get a + // database-assigned ID. Given that users might set their `PartitionKey`, a cryptographic + // hash scoped by the `TableId` is needed to prevent malicious users from constructing + // collisions. This data will be held in memory across many services, so SHA-256 was chosen + // over SHA-512 to get the needed attributes in the smallest amount of space. + let mut inner = sha2::Sha256::new(); + + let table_bytes = table_id.to_be_bytes(); + // Avoiding collisions depends on the table ID's bytes always being a fixed size. So even + // though the current return type of `TableId::to_be_bytes` is `[u8; 8]`, we're asserting + // on the length here to make sure this code's assumptions hold even if the type of + // `TableId` changes in the future. 
+ assert_eq!(table_bytes.len(), 8); + inner.update(table_bytes); + + inner.update(key); + Self(Arc::new(inner.finalize().into())) + } + + /// Read access to the bytes of the hash identifier. + pub fn as_bytes(&self) -> &[u8] { + self.0.as_ref() + } + + /// Size in bytes including `Self`. + pub fn size(&self) -> usize { + std::mem::size_of::() + self.0.len() + } + + /// Create a new `PartitionHashId` for cases in tests where you need some value but the value + /// doesn't matter. Public and not test-only so that other crates' tests can use this. + pub fn arbitrary_for_testing() -> Self { + Self::new(TableId::new(0), &PartitionKey::from("arbitrary")) + } +} + +impl<'q> sqlx::encode::Encode<'q, sqlx::Postgres> for &'q PartitionHashId { + fn encode_by_ref(&self, buf: &mut sqlx::postgres::PgArgumentBuffer) -> sqlx::encode::IsNull { + buf.extend_from_slice(self.0.as_ref()); + + sqlx::encode::IsNull::No + } +} + +impl<'q> sqlx::encode::Encode<'q, sqlx::Sqlite> for &'q PartitionHashId { + fn encode_by_ref( + &self, + args: &mut Vec>, + ) -> sqlx::encode::IsNull { + args.push(sqlx::sqlite::SqliteArgumentValue::Blob( + std::borrow::Cow::Borrowed(self.0.as_ref()), + )); + + sqlx::encode::IsNull::No + } +} + +impl<'r, DB: ::sqlx::Database> ::sqlx::decode::Decode<'r, DB> for PartitionHashId +where + &'r [u8]: sqlx::Decode<'r, DB>, +{ + fn decode( + value: >::ValueRef, + ) -> ::std::result::Result< + Self, + ::std::boxed::Box< + dyn ::std::error::Error + 'static + ::std::marker::Send + ::std::marker::Sync, + >, + > { + let data = <&[u8] as ::sqlx::decode::Decode<'r, DB>>::decode(value)?; + let data: [u8; PARTITION_HASH_ID_SIZE_BYTES] = data.try_into()?; + Ok(Self(Arc::new(data))) + } +} + +impl<'r, DB: ::sqlx::Database> ::sqlx::Type for PartitionHashId +where + &'r [u8]: ::sqlx::Type, +{ + fn type_info() -> DB::TypeInfo { + <&[u8] as ::sqlx::Type>::type_info() + } +} + +/// Data object for a partition. The combination of table and key are unique (i.e. 
only one record +/// can exist for each combo) +#[derive(Debug, Clone, PartialEq, Eq, sqlx::FromRow, Hash)] +pub struct Partition { + /// the id of the partition + pub id: PartitionId, + /// The unique hash derived from the table ID and partition key, if available. This will become + /// required when partitions without the value have aged out. + hash_id: Option, + /// the table the partition is under + pub table_id: TableId, + /// the string key of the partition + pub partition_key: PartitionKey, + + /// Vector of column IDs that describes how *every* parquet file + /// in this [`Partition`] is sorted. The sort key contains all the + /// primary key (PK) columns that have been persisted, and nothing + /// else. The PK columns are all `tag` columns and the `time` + /// column. + /// + /// Even though it is possible for both the unpersisted data + /// and/or multiple parquet files to contain different subsets of + /// columns, the partition's sort key is guaranteed to be + /// "compatible" across all files. Compatible means that the + /// parquet file is sorted in the same order as the partition + /// sort key after removing any missing columns. + /// + /// Partitions are initially created before any data is persisted + /// with an empty sort key. The partition sort key is updated as + /// needed when data is persisted to parquet files: both on the + /// first persist when the sort key is empty, as on subsequent + /// persist operations when new tags occur in newly inserted data. + /// + /// Updating inserts new columns into the existing sort key. The order + /// of the existing columns relative to each other is NOT changed. + /// + /// For example, updating `A,B,C` to either `A,D,B,C` or `A,B,C,D` + /// is legal. However, updating to `A,C,D,B` is not because the + /// relative order of B and C has been reversed. 
+ sort_key_ids: SortKeyIds, + + /// The time at which the newest file of the partition is created + pub new_file_at: Option, +} + +impl Partition { + /// Create a new Partition data object from the given attributes. This constructor will take + /// care of computing the [`PartitionHashId`]. + /// + /// This is only appropriate to use in the catalog or in tests. + pub fn new_catalog_only( + id: PartitionId, + hash_id: Option, + table_id: TableId, + partition_key: PartitionKey, + sort_key_ids: SortKeyIds, + new_file_at: Option, + ) -> Self { + Self { + id, + hash_id, + table_id, + partition_key, + sort_key_ids, + new_file_at, + } + } + + /// If this partition has a `PartitionHashId` stored in the catalog, use that. Otherwise, use + /// the database-assigned `PartitionId`. + pub fn transition_partition_id(&self) -> TransitionPartitionId { + TransitionPartitionId::from((self.id, self.hash_id.as_ref())) + } + + /// The unique hash derived from the table ID and partition key, if it exists in the catalog. + pub fn hash_id(&self) -> Option<&PartitionHashId> { + self.hash_id.as_ref() + } + + /// The sort key IDs, if the sort key has been set + pub fn sort_key_ids(&self) -> Option<&SortKeyIds> { + if self.sort_key_ids.is_empty() { + None + } else { + Some(&self.sort_key_ids) + } + } + + /// The sort key containing the column names found in the specified column map. + /// + /// # Panics + /// + /// Will panic if an ID isn't found in the column map. + pub fn sort_key(&self, columns_by_name: &ColumnsByName) -> Option { + self.sort_key_ids() + .map(|sort_key_ids| sort_key_ids.to_sort_key(columns_by_name)) + } + + /// Change the sort key IDs to the given sort key IDs. This should only be used in the + /// in-memory catalog or in tests; all other sort key updates should go through the catalog + /// functions. 
+ pub fn set_sort_key_ids(&mut self, sort_key_ids: &SortKeyIds) { + self.sort_key_ids = sort_key_ids.clone(); + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::hash::{Hash, Hasher}; + + use super::*; + + use assert_matches::assert_matches; + use proptest::{prelude::*, proptest}; + + /// A fixture test asserting the deterministic partition ID generation + /// algorithm outputs a fixed value, preventing accidental changes to the + /// derived ID. + /// + /// This hash byte value MUST NOT change for the lifetime of a cluster + /// (though the encoding used in this test can). + #[test] + fn display_partition_hash_id_in_hex() { + let partition_hash_id = + PartitionHashId::new(TableId::new(5), &PartitionKey::from("2023-06-08")); + + assert_eq!( + "ebd1041daa7c644c99967b817ae607bdcb754c663f2c415f270d6df720280f7a", + partition_hash_id.to_string() + ); + } + + prop_compose! { + /// Return an arbitrary [`TransitionPartitionId`] with a randomised ID + /// value. + pub fn arbitrary_partition_id()( + use_hash in any::(), + row_id in any::(), + hash_id in any::<[u8; PARTITION_HASH_ID_SIZE_BYTES]>() + ) -> TransitionPartitionId { + match use_hash { + true => TransitionPartitionId::Deterministic(PartitionHashId(hash_id.into())), + false => TransitionPartitionId::Deprecated(PartitionId::new(row_id)), + } + } + } + + proptest! { + #[test] + fn partition_hash_id_representations( + table_id in 0..i64::MAX, + partition_key in ".+", + ) { + let table_id = TableId::new(table_id); + let partition_key = PartitionKey::from(partition_key); + + let partition_hash_id = PartitionHashId::new(table_id, &partition_key); + + // ID generation MUST be deterministic. 
+ let partition_hash_id_regenerated = PartitionHashId::new(table_id, &partition_key); + assert_eq!(partition_hash_id, partition_hash_id_regenerated); + + // ID generation MUST be collision resistant; different inputs -> different IDs + let other_table_id = TableId::new(table_id.get().wrapping_add(1)); + let different_partition_hash_id = PartitionHashId::new(other_table_id, &partition_key); + assert_ne!(partition_hash_id, different_partition_hash_id); + + // The bytes of the partition hash ID are stored in the catalog and sent from the + // ingesters to the queriers. We should be able to round-trip through bytes. + let bytes_representation = partition_hash_id.as_bytes(); + assert_eq!(bytes_representation.len(), 32); + let from_bytes = PartitionHashId::try_from(bytes_representation).unwrap(); + assert_eq!(from_bytes, partition_hash_id); + + // The hex string of the bytes is used in the Parquet file path in object storage, and + // should always be the same length. + let string_representation = partition_hash_id.to_string(); + assert_eq!(string_representation.len(), 64); + + // While nothing is currently deserializing the hex string to create `PartitionHashId` + // instances, it should work because there's nothing preventing it either. + let bytes_from_string = hex::decode(string_representation).unwrap(); + let from_string = PartitionHashId::try_from(&bytes_from_string[..]).unwrap(); + assert_eq!(from_string, partition_hash_id); + } + + /// Assert a [`TransitionPartitionId`] is round-trippable through proto + /// serialisation. + #[test] + fn prop_partition_id_proto_round_trip(id in arbitrary_partition_id()) { + use generated_types::influxdata::iox::catalog::v1 as proto; + + // Encoding is infallible + let encoded = proto::PartitionIdentifier::from(id.clone()); + + // Decoding a valid ID is infallible. 
+ let decoded = TransitionPartitionId::try_from(encoded).unwrap(); + + // The deserialised value must match the input (round trippable) + assert_eq!(decoded, id); + } + } + + #[test] + fn test_proto_no_id() { + use generated_types::influxdata::iox::catalog::v1 as proto; + + let msg = proto::PartitionIdentifier { id: None }; + + assert_matches!( + TransitionPartitionId::try_from(msg), + Err(PartitionIdProtoError::NoId) + ); + } + + #[test] + fn test_proto_bad_hash() { + use generated_types::influxdata::iox::catalog::v1 as proto; + + let msg = proto::PartitionIdentifier { + id: Some(proto::partition_identifier::Id::HashId(vec![42])), + }; + + assert_matches!( + TransitionPartitionId::try_from(msg), + Err(PartitionIdProtoError::InvalidHashId(_)) + ); + } + + #[test] + fn test_hash_partition_hash_id() { + let id = PartitionHashId::arbitrary_for_testing(); + + let mut hasher = TestHasher::default(); + id.hash(&mut hasher); + + assert_eq!(hasher.written, vec![id.as_bytes()[..8].to_vec()],); + } + + #[derive(Debug, Default)] + struct TestHasher { + written: Vec>, + } + + impl Hasher for TestHasher { + fn finish(&self) -> u64 { + unimplemented!() + } + + fn write(&mut self, bytes: &[u8]) { + self.written.push(bytes.to_vec()); + } + } +} diff --git a/data_types/src/partition_template.rs b/data_types/src/partition_template.rs new file mode 100644 index 0000000..bbd0633 --- /dev/null +++ b/data_types/src/partition_template.rs @@ -0,0 +1,1949 @@ +//! Partition templating with per-namespace & table override types. +//! +//! The override types utilise per-entity wrappers for type safety, ensuring a +//! namespace override is not used in a table (and vice versa), as well as to +//! ensure the correct chain of inheritance is adhered to at compile time. +//! +//! A partitioning template is resolved by evaluating the following (in order of +//! precedence): +//! +//! 1. Table name override, if specified. +//! 2. Namespace name override, if specified. +//! 3. 
Default partitioning scheme (YYYY-MM-DD) +//! +//! Each of the [`NamespacePartitionTemplateOverride`] and +//! [`TablePartitionTemplateOverride`] stores only the override, if provided, +//! and implicitly resolves to the default partitioning scheme if no override is +//! specified (indicated by the presence of [`Option::None`] in the wrapper). +//! +//! ## Default Partition Key +//! +//! The default partition key format is specified by [`PARTITION_BY_DAY_PROTO`], +//! with a template consisting of a single part: a YYYY-MM-DD representation of +//! the time row timestamp. +//! +//! ## Partition Key Format +//! +//! Should a partition template be used that generates a partition key +//! containing more than one part, those parts are delimited by the `|` +//! character ([`PARTITION_KEY_DELIMITER`]), chosen to be an unusual character +//! that is unlikely to occur in user-provided column values in order to +//! minimise the need to encode the value in the common case, while still +//! providing legible / printable keys. Should the user-provided column value +//! contain the `|` key, it is [percent encoded] (in addition to `!` below, and +//! the `%` character itself) to prevent ambiguity. +//! +//! It is an invariant that the resulting partition key derived from a given +//! template has the same number and ordering of parts. +//! +//! If the partition key template references a [`TemplatePart::TagValue`] column +//! that is not present in the row, a single `!` is inserted, indicating a NULL +//! template key part. If the value of the part is an empty string (""), a `^` +//! is inserted to ensure a non-empty partition key is always generated. Like +//! the `|` key above, any occurrence of these characters in a user-provided +//! column value is percent encoded. +//! +//! Because this serialisation format can be unambiguously reversed, the +//! [`build_column_values()`] function can be used to obtain the set of +//! 
[`TemplatePart::TagValue`] the key was constructed from. +//! +//! ### Value Truncation +//! +//! Partition key parts are limited to, at most, 200 bytes in length +//! ([`PARTITION_KEY_MAX_PART_LEN`]). If any single partition key part exceeds +//! this length limit, it is truncated and the truncation marker `#` +//! ([`PARTITION_KEY_PART_TRUNCATED`]) is appended. +//! +//! When rebuilding column values using [`build_column_values()`], a truncated +//! key part yields [`ColumnValue::Prefix`], which can only be used for prefix +//! matching - equality matching against a string always returns false. +//! +//! Two considerations must be made when truncating the generated key: +//! +//! * The string may contain encoded sequences in the form %XX, and the string +//! should not be split within an encoded sequence, or decoding the string +//! will fail. +//! +//! * This may be a unicode string - what the user might consider a "character" +//! may in fact be multiple unicode code-points, each of which may span +//! multiple bytes. +//! +//! Slicing a unicode code-point in half may lead to an invalid UTF-8 string, +//! which will prevent it from being used in Rust (and likely many other +//! languages/systems). Because partition keys are represented as strings and +//! not bytes, splitting a code-point in half MUST be avoided. +//! +//! Further to this, a sequence of multiple code-points can represent a single +//! "character" - this is called a grapheme. For example, the representation of +//! the Tamil "ni" character "நி" is composed of two multi-byte code-points; the +//! Tamil letter "na" which renders as "ந" and the vowel sign "ி", each 3 bytes +//! long. If split after the first 3 bytes, the compound "ni" character will be +//! incorrectly rendered as the single "na"/"ந" character. +//! +//! Depending on what the consumer of the split string considers a character, +//! prefix/equality matching may produce differing results if a grapheme is +//! split. 
If the caller performs a byte-wise comparison, everything is fine - +//! if they perform a "character" comparison, then the equality may be lost +//! depending on what they consider a character. +//! +//! Therefore this implementation takes the conservative approach of never +//! splitting code-points (for UTF-8 correctness) nor graphemes for simplicity +//! and compatibility for the consumer. This may be relaxed in the future to +//! allow splitting graphemes, but by being conservative we give ourselves this +//! option - we can't easily do the reverse! +//! +//! ## Part Limit & Maximum Key Size +//! +//! The number of parts in a partition template is limited to 8 +//! ([`MAXIMUM_NUMBER_OF_TEMPLATE_PARTS`]), validated at creation time. +//! +//! Together with the above value truncation, this bounds the maximum length of +//! a partition key to 1,607 bytes (1.57 KiB). +//! +//! ### Reserved Characters +//! +//! Reserved characters that are percent encoded (in addition to non-ASCII +//! characters), and their meaning: +//! +//! * `|` - partition key part delimiter ([`PARTITION_KEY_DELIMITER`]) +//! * `!` - NULL/missing partition key part ([`PARTITION_KEY_VALUE_NULL`]) +//! * `^` - empty string partition key part ([`PARTITION_KEY_VALUE_EMPTY`]) +//! * `#` - key part truncation marker ([`PARTITION_KEY_PART_TRUNCATED`]) +//! * `%` - required for unambiguous reversal of percent encoding +//! +//! These characters are defined in [`ENCODED_PARTITION_KEY_CHARS`] and chosen +//! due to their low likelihood of occurrence in user-provided column values. +//! +//! ### Reserved Tag Values +//! +//! Reserved tag values that cannot be used: +//! +//! * `time` - The time column has special meaning and is covered by strftime +//! formatters ([`TAG_VALUE_KEY_TIME`]) +//! +//! ### Examples +//! +//! When using the partition template below: +//! +//! ```text +//! [ +//! TemplatePart::TimeFormat("%Y"), +//! TemplatePart::TagValue("a"), +//! TemplatePart::TagValue("b"), +//! 
TemplatePart::Bucket("c", 10) +//! ] +//! ``` +//! +//! The following partition keys are derived: +//! +//! * `time=2023-01-01, a=bananas, b=plátanos, c=ananas` -> `2023|bananas|plátanos|5` +//! * `time=2023-01-01, b=plátanos` -> `2023|!|plátanos|!` +//! * `time=2023-01-01, another=cat, b=plátanos` -> `2023|!|plátanos|!` +//! * `time=2023-01-01` -> `2023|!|!|!` +//! * `time=2023-01-01, a=cat|dog, b=!, c=!` -> `2023|cat%7Cdog|%21|8` +//! * `time=2023-01-01, a=%50, c=%50` -> `2023|%2550|!|9` +//! * `time=2023-01-01, a=, c=` -> `2023|^|!|0` +//! * `time=2023-01-01, a=` -> `2023|#|!|!` +//! +//! When using the default partitioning template (YYYY-MM-DD) there is no +//! encoding necessary, as the derived partition key contains a single part, and +//! no reserved characters. +//! +//! [percent encoded]: https://url.spec.whatwg.org/#percent-encoded-bytes +use std::{ + borrow::Cow, + fmt::{Display, Formatter}, + ops::Range, + sync::Arc, +}; + +use chrono::{ + format::{Numeric, StrftimeItems}, + DateTime, Days, Months, Utc, +}; +use generated_types::influxdata::iox::partition_template::v1 as proto; +use murmur3::murmur3_32; +use once_cell::sync::Lazy; +use percent_encoding::{percent_decode_str, AsciiSet, CONTROLS}; +use schema::TIME_COLUMN_NAME; +use thiserror::Error; + +/// Reasons a user-specified partition template isn't valid. +#[derive(Debug, Error)] +#[allow(missing_copy_implementations)] +pub enum ValidationError { + /// The partition template didn't define any parts. + #[error("Custom partition template must have at least one part")] + NoParts, + + /// The partition template exceeded the maximum allowed number of parts. + #[error( + "Custom partition template specified {specified} parts. \ + Partition templates may have a maximum of {MAXIMUM_NUMBER_OF_TEMPLATE_PARTS} parts." + )] + TooManyParts { + /// The number of template parts that were present in the user-provided custom partition + /// template. 
+ specified: usize, + }, + + /// The partition template defines a [`TimeFormat`] part, but the + /// provided strftime formatter is invalid. + /// + /// [`TimeFormat`]: [`proto::template_part::Part::TimeFormat`] + #[error("invalid strftime format in partition template: {0}")] + InvalidStrftime(String), + + /// The partition template defines a [`TagValue`] part or [`Bucket`] part, + /// but the provided tag name value is invalid. + /// + /// [`TagValue`]: [`proto::template_part::Part::TagValue`] + /// [`Bucket`]: [`proto::template_part::Part::Bucket`] + #[error("invalid tag name value in partition template: {0}")] + InvalidTagValue(String), + + /// The partition template defines a [`Bucket`] part, but the provided + /// number of buckets is invalid. + /// + /// [`Bucket`]: [`proto::template_part::Part::Bucket`] + #[error( + "number of buckets in partition template must be in range \ + [{ALLOWED_BUCKET_QUANTITIES:?}), number specified: {0}" + )] + InvalidNumberOfBuckets(u32), + + /// The partition template defines a [`TagValue`] or [`Bucket`] part + /// which repeats a tag name used in another [`TagValue`] or [`Bucket`] part. + /// This is not allowed + /// + /// [`TagValue`]: [`proto::template_part::Part::TagValue`] + /// [`Bucket`]: [`proto::template_part::Part::Bucket`] + #[error("tag name value cannot be repeated in partition template: {0}")] + RepeatedTagValue(String), +} + +/// The maximum number of template parts a custom partition template may specify, to limit the +/// amount of space in the catalog used by the custom partition template and the partition keys +/// created with it. +pub const MAXIMUM_NUMBER_OF_TEMPLATE_PARTS: usize = 8; + +/// The sentinel character used to delimit partition key parts in the partition +/// key string. +pub const PARTITION_KEY_DELIMITER: char = '|'; + +/// The sentinel character used to indicate an empty string partition key part +/// in the partition key string. 
+pub const PARTITION_KEY_VALUE_EMPTY: char = '^'; + +/// The `str` form of the [`PARTITION_KEY_VALUE_EMPTY`] character. +pub const PARTITION_KEY_VALUE_EMPTY_STR: &str = "^"; + +/// The sentinel character used to indicate a missing partition key part in the +/// partition key string. +pub const PARTITION_KEY_VALUE_NULL: char = '!'; + +/// The `str` form of the [`PARTITION_KEY_VALUE_NULL`] character. +pub const PARTITION_KEY_VALUE_NULL_STR: &str = "!"; + +/// The maximum permissible length of a partition key part, after encoding +/// reserved & non-ASCII characters. +pub const PARTITION_KEY_MAX_PART_LEN: usize = 200; + +/// The truncation sentinel character, used to explicitly identify a partition +/// key as having been truncated. +/// +/// Truncated partition key parts can only be used for prefix matching, and +/// yield a [`ColumnValue::Prefix`] from [`build_column_values()`]. +pub const PARTITION_KEY_PART_TRUNCATED: char = '#'; + +/// The reserved tag value key for the `time` column, which is reserved as +/// a specifically formatted column for the time associated with any given +/// data point. +pub const TAG_VALUE_KEY_TIME: &str = "time"; + +/// The range of bucket quantities allowed for [`Bucket`] template parts. +/// +/// [`Bucket`]: [`proto::template_part::Part::Bucket`] +pub const ALLOWED_BUCKET_QUANTITIES: Range = Range { + start: 1, + end: 100_000, +}; + +/// The minimal set of characters that must be encoded during partition key +/// generation when they form part of a partition key part, in order to be +/// unambiguously reversible. +/// +/// See module-level documentation & [`build_column_values()`]. 
+pub const ENCODED_PARTITION_KEY_CHARS: AsciiSet = CONTROLS + .add(PARTITION_KEY_DELIMITER as u8) + .add(PARTITION_KEY_VALUE_NULL as u8) + .add(PARTITION_KEY_VALUE_EMPTY as u8) + .add(PARTITION_KEY_PART_TRUNCATED as u8) + .add(b'%'); // Required for reversible unambiguous encoding + +/// Allocationless and protobufless access to the parts of a template needed to +/// actually do partitioning. +#[derive(Debug, Clone)] +pub enum TemplatePart<'a> { + /// A tag-value partition part. + /// + /// Specifies the name of the tag column. + TagValue(&'a str), + + /// A strftime formatter. + /// + /// Specifies the formatter spec applied to the [`TIME_COLUMN_NAME`] column. + TimeFormat(&'a str), + + /// A bucketing partition part. + /// + /// Specifies the name of the tag column used to derive which of the `n` + /// buckets the data belongs in, through the mechanism implemented by the + /// [`bucket_for_tag_value`] function. + Bucket(&'a str, u32), +} + +/// The default partitioning scheme is by each day according to the "time" column. +pub static PARTITION_BY_DAY_PROTO: Lazy> = Lazy::new(|| { + Arc::new(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat( + "%Y-%m-%d".to_owned(), + )), + }], + }) +}); + +// This applies murmur3 32 bit hashing to the tag value string, as Iceberg would. +// +// * +fn iceberg_hash(tag_value: &str) -> u32 { + murmur3_32(&mut tag_value.as_bytes(), 0).expect("read of tag value string must never error") +} + +/// Hash bucket the provided tag value to a bucket ID in the range `[0,num_buckets)`. +/// +/// This applies murmur3 32 bit hashing to the tag value string, zero-ing the sign bit +/// then modulo assigning it to a bucket as Iceberg would. +/// +/// * +/// * +/// +/// +/// # Panics +/// +/// If `num_buckets` is zero, this will panic. Validation MUST prevent +/// [`TemplatePart::Bucket`] from being constructed with a zero bucket count. 
It just +/// makes no sense and shouldn't need to be checked here. +#[inline(always)] +pub fn bucket_for_tag_value(tag_value: &str, num_buckets: u32) -> u32 { + // Hash the tag value as iceberg would. + let hash = iceberg_hash(tag_value); + // Then bucket it as iceberg would, by removing the sign bit from the + // 32 bit murmur hash and modulo by the number of buckets to assign + // across. + (hash & i32::MAX as u32) % num_buckets +} + +/// A partition template specified by a namespace record. +/// +/// Internally this type is [`None`] when no namespace-level override is +/// specified, resulting in the default being used. +#[derive(Debug, PartialEq, Clone, Default, sqlx::Type, Hash)] +#[sqlx(transparent, no_pg_array)] +pub struct NamespacePartitionTemplateOverride(Option); + +impl NamespacePartitionTemplateOverride { + /// A const "default" impl for testing. + pub const fn const_default() -> Self { + Self(None) + } + + /// Return the protobuf representation of this template. + pub fn as_proto(&self) -> Option<&proto::PartitionTemplate> { + self.0.as_ref().map(|v| v.inner()) + } +} + +impl TryFrom for NamespacePartitionTemplateOverride { + type Error = ValidationError; + + fn try_from(partition_template: proto::PartitionTemplate) -> Result { + Ok(Self(Some(serialization::Wrapper::try_from( + partition_template, + )?))) + } +} + +/// A partition template specified by a table record. +#[derive(Debug, PartialEq, Eq, Clone, Default, sqlx::Type, Hash)] +#[sqlx(transparent, no_pg_array)] +pub struct TablePartitionTemplateOverride(Option); + +impl TablePartitionTemplateOverride { + /// When a table is being explicitly created, the creation request might have contained a + /// custom partition template for that table. If the custom partition template is present, use + /// it. Otherwise, use the namespace's partition template. + /// + /// # Errors + /// + /// This function will return an error if the custom partition template specified is invalid. 
+ pub fn try_new( + custom_table_template: Option, + namespace_template: &NamespacePartitionTemplateOverride, + ) -> Result { + match (custom_table_template, namespace_template.0.as_ref()) { + (Some(table_proto), _) => { + Ok(Self(Some(serialization::Wrapper::try_from(table_proto)?))) + } + (None, Some(namespace_serialization_wrapper)) => { + Ok(Self(Some(namespace_serialization_wrapper.clone()))) + } + (None, None) => Ok(Self(None)), + } + } + + /// Returns the number of parts in this template. + #[allow(clippy::len_without_is_empty)] // Senseless - there must always be >0 parts. + pub fn len(&self) -> usize { + self.parts().count() + } + + /// Iterate through the protobuf parts and lend out what the `mutable_batch` crate needs to + /// build `PartitionKey`s. If this table doesn't have a custom template, use the application + /// default of partitioning by day. + pub fn parts(&self) -> impl Iterator> { + self.0 + .as_ref() + .map(|serialization_wrapper| serialization_wrapper.inner()) + .unwrap_or_else(|| &PARTITION_BY_DAY_PROTO) + .parts + .iter() + .flat_map(|part| part.part.as_ref()) + .map(|part| match part { + proto::template_part::Part::TagValue(value) => TemplatePart::TagValue(value), + proto::template_part::Part::TimeFormat(fmt) => TemplatePart::TimeFormat(fmt), + proto::template_part::Part::Bucket(proto::Bucket { + tag_name, + num_buckets, + }) => TemplatePart::Bucket(tag_name, *num_buckets), + }) + } + + /// Size in bytes, including `self`. + /// + /// This accounts for the entire allocation of this object, even when it shared (via an internal [`Arc`]). 
+ pub fn size(&self) -> usize { + std::mem::size_of_val(self) + + self + .0 + .as_ref() + .map(|wrapper| { + let inner = wrapper.inner(); + + // inner is wrapped into an Arc, so we need to account for that allocation + std::mem::size_of::() + + (inner.parts.capacity() * std::mem::size_of::()) + + inner + .parts + .iter() + .map(|part| { + part.part + .as_ref() + .map(|part| match part { + proto::template_part::Part::TagValue(s) => s.capacity(), + proto::template_part::Part::TimeFormat(s) => s.capacity(), + proto::template_part::Part::Bucket(proto::Bucket { + tag_name, + num_buckets: _, + }) => tag_name.capacity() + std::mem::size_of::(), + }) + .unwrap_or_default() + }) + .sum::() + }) + .unwrap_or_default() + } + + /// Return the protobuf representation of this template. + pub fn as_proto(&self) -> Option<&proto::PartitionTemplate> { + self.0.as_ref().map(|v| v.inner()) + } +} + +/// Display the serde_json representation so that the output +/// can be copy/pasted into CLI tools, etc as the partition +/// template is specified as JSON +impl Display for TablePartitionTemplateOverride { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + self.as_proto() + .map(|proto| serde_json::to_string(proto) + .expect("serialization should be infallible")) + .unwrap_or_default() + ) + } +} + +impl TryFrom> for TablePartitionTemplateOverride { + type Error = ValidationError; + + fn try_from(p: Option) -> Result { + Ok(Self(p.map(serialization::Wrapper::try_from).transpose()?)) + } +} + +/// This manages the serialization/deserialization of the `proto::PartitionTemplate` type to and +/// from the database through `sqlx` for the `NamespacePartitionTemplateOverride` and +/// `TablePartitionTemplateOverride` types. It's an internal implementation detail to minimize code +/// duplication. 
+mod serialization { + use super::{ + ValidationError, ALLOWED_BUCKET_QUANTITIES, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS, + TAG_VALUE_KEY_TIME, + }; + use chrono::{format::StrftimeItems, Utc}; + use generated_types::influxdata::iox::partition_template::v1 as proto; + use std::{collections::HashSet, fmt::Write, sync::Arc}; + + #[derive(Debug, Clone, PartialEq, Hash)] + pub struct Wrapper(Arc); + + impl Wrapper { + /// Read access to the inner proto + pub fn inner(&self) -> &proto::PartitionTemplate { + &self.0 + } + + /// THIS IS FOR TESTING PURPOSES ONLY AND SHOULD NOT BE USED IN PRODUCTION CODE. + /// + /// The application shouldn't be putting invalid templates into the database because all + /// creation of `Wrapper`s should be going through the + /// `TryFrom::try_from` constructor that rejects invalid + /// templates. However, that leaves the possibility of the database getting an invalid + /// template through some other means, and we want to be able to construct those easily in + /// tests to make sure code using partition templates can handle the unlikely possibility + /// of an invalid template in the database. + pub(super) fn for_testing_possibility_of_invalid_value_in_database( + proto: proto::PartitionTemplate, + ) -> Self { + Self(Arc::new(proto)) + } + } + + // protobuf types normally don't implement `Eq`, but for this concrete type this is OK + impl Eq for Wrapper {} + + impl TryFrom for Wrapper { + type Error = ValidationError; + + fn try_from(partition_template: proto::PartitionTemplate) -> Result { + // There must be at least one part. + if partition_template.parts.is_empty() { + return Err(ValidationError::NoParts); + } + + // There may not be more than `MAXIMUM_NUMBER_OF_TEMPLATE_PARTS` parts. 
+ let specified = partition_template.parts.len(); + if specified > MAXIMUM_NUMBER_OF_TEMPLATE_PARTS { + return Err(ValidationError::TooManyParts { specified }); + } + + let mut seen_tags: HashSet<&str> = HashSet::with_capacity(specified); + + // All time formats must be valid and tag values may not specify any + // restricted values. + for part in &partition_template.parts { + match &part.part { + Some(proto::template_part::Part::TimeFormat(fmt)) => { + // Empty is not a valid time format + if fmt.is_empty() { + return Err(ValidationError::InvalidStrftime(fmt.into())); + } + + // Chrono will panic during timestamp formatting if this + // formatter directive is used! + // + // An upper-case Z does not trigger the panic code path so + // is not checked for. + if fmt.contains("%#z") { + return Err(ValidationError::InvalidStrftime( + "%#z cannot be used".to_string(), + )); + } + + // Currently we can only tell whether a nonempty format is valid by trying + // to use it. See + let mut dev_null = String::new(); + write!( + dev_null, + "{}", + Utc::now().format_with_items(StrftimeItems::new(fmt)) + ) + .map_err(|_| ValidationError::InvalidStrftime(fmt.into()))? 
+ } + Some(proto::template_part::Part::TagValue(value)) => { + // Empty is not a valid tag value + if value.is_empty() { + return Err(ValidationError::InvalidTagValue(value.into())); + } + + if value.contains(TAG_VALUE_KEY_TIME) { + return Err(ValidationError::InvalidTagValue(format!( + "{TAG_VALUE_KEY_TIME} cannot be used" + ))); + } + + if !seen_tags.insert(value.as_str()) { + return Err(ValidationError::RepeatedTagValue(value.into())); + } + } + Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name, + num_buckets, + })) => { + if tag_name.is_empty() { + return Err(ValidationError::InvalidTagValue(tag_name.into())); + } + + if tag_name.contains(TAG_VALUE_KEY_TIME) { + return Err(ValidationError::InvalidTagValue(format!( + "{TAG_VALUE_KEY_TIME} cannot be used" + ))); + } + + if !seen_tags.insert(tag_name.as_str()) { + return Err(ValidationError::RepeatedTagValue(tag_name.into())); + } + + if !ALLOWED_BUCKET_QUANTITIES.contains(num_buckets) { + return Err(ValidationError::InvalidNumberOfBuckets(*num_buckets)); + } + } + None => {} + } + } + + Ok(Self(Arc::new(partition_template))) + } + } + + impl sqlx::Type for Wrapper + where + sqlx::types::Json: sqlx::Type, + DB: sqlx::Database, + { + fn type_info() -> DB::TypeInfo { + as sqlx::Type>::type_info() + } + } + + impl<'q, DB> sqlx::Encode<'q, DB> for Wrapper + where + DB: sqlx::Database, + for<'b> sqlx::types::Json<&'b proto::PartitionTemplate>: sqlx::Encode<'q, DB>, + { + fn encode_by_ref( + &self, + buf: &mut >::ArgumentBuffer, + ) -> sqlx::encode::IsNull { + as sqlx::Encode<'_, DB>>::encode_by_ref( + &sqlx::types::Json(&self.0), + buf, + ) + } + } + + impl<'q, DB> sqlx::Decode<'q, DB> for Wrapper + where + DB: sqlx::Database, + sqlx::types::Json: sqlx::Decode<'q, DB>, + { + fn decode( + value: >::ValueRef, + ) -> Result> { + Ok(Self( + as sqlx::Decode<'_, DB>>::decode( + value, + )? + .0 + .into(), + )) + } + } +} + +/// The value of a column, reversed from a partition key. 
+/// +/// See [`build_column_values()`]. +#[derive(Debug, Clone, PartialEq)] +pub enum ColumnValue<'a> { + /// The inner value is the exact, unmodified input column value. + Identity(Cow<'a, str>), + + /// The inner value is a variable length prefix of the input column value. + /// + /// The string value is always guaranteed to be valid UTF-8. + /// + /// Attempting to equality match this variant against a string will always + /// be false - use [`ColumnValue::is_prefix_match_of()`] to prefix match + /// instead. + Prefix(Cow<'a, str>), + + /// Datetime. + Datetime { + /// Inclusive begin of the datatime partition range. + begin: DateTime, + + /// Exclusive end of the datatime partition range. + end: DateTime, + }, + + /// The inner value is the ID of the bucket selected through a modulo hash + /// of the input column value. + Bucket(u32), +} + +impl<'a> ColumnValue<'a> { + /// Returns true if `other` is a byte-wise prefix match of `self`. + /// + /// This method can be called for both [`ColumnValue::Identity`] and + /// [`ColumnValue::Prefix`]. + pub fn is_prefix_match_of(&self, other: T) -> bool + where + T: AsRef<[u8]>, + { + let this = match self { + ColumnValue::Identity(v) => v.as_bytes(), + ColumnValue::Prefix(v) => v.as_bytes(), + ColumnValue::Datetime { .. } | ColumnValue::Bucket(..) => { + return false; + } + }; + + other.as_ref().starts_with(this) + } +} + +impl<'a, T> PartialEq for ColumnValue<'a> +where + T: AsRef, +{ + fn eq(&self, other: &T) -> bool { + match self { + ColumnValue::Identity(v) => other.as_ref().eq(v.as_ref()), + ColumnValue::Prefix(_) => false, + ColumnValue::Datetime { .. } => false, + ColumnValue::Bucket(..) => false, + } + } +} + +/// Reverse a `partition_key` generated from the given partition key `template`, +/// reconstructing the set of tag values in the form of `(column name, column +/// value)` tuples that the `partition_key` was generated from. +/// +/// The `partition_key` MUST have been generated by `template`. 
+/// +/// Values are returned as a [`Cow`], avoiding the need for value copying if +/// they do not need decoding. See module docs for encoding/decoding. +/// +/// # Panics +/// +/// This method panics if a column value is not valid UTF8 after decoding, or +/// when a bucket ID is not valid (not a u32 or within the expected number of +/// buckets). +pub fn build_column_values<'a>( + template: &'a TablePartitionTemplateOverride, + partition_key: &'a str, +) -> impl Iterator)> { + // Exploded parts of the generated key on the "/" character. + // + // Any uses of the "/" character within the partition key's user-provided + // values are url encoded, so this is an unambiguous field separator. + let key_parts = partition_key.split(PARTITION_KEY_DELIMITER); + + // Obtain an iterator of template parts, from which the meaning of the key + // parts can be inferred. + let template_parts = template.parts(); + + // Invariant: the number of key parts generated from a given template always + // matches the number of template parts. + // + // The key_parts iterator is not an ExactSizeIterator, so an assert can't be + // placed here to validate this property. + + // Produce an iterator of (template_part, template_value) + template_parts + .zip(key_parts) + .filter_map(|(template, value)| { + if value == PARTITION_KEY_VALUE_NULL_STR { + None + } else { + match template { + TemplatePart::TagValue(col_name) => { + Some((col_name, parse_part_tag_value(value)?)) + } + TemplatePart::TimeFormat(format) => { + Some((TIME_COLUMN_NAME, parse_part_time_format(value, format)?)) + } + TemplatePart::Bucket(col_name, num_buckets) => { + Some((col_name, parse_part_bucket(value, num_buckets)?)) + } + } + } + }) +} + +fn parse_part_tag_value(value: &str) -> Option> { + // Perform re-mapping of sentinel values. + let value = match value { + PARTITION_KEY_VALUE_EMPTY_STR => { + // Re-map the empty string sentinel "^"" to an empty string + // value. 
+ "" + } + _ => value, + }; + + // Reverse the urlencoding of all value parts + let decoded = percent_decode_str(value) + .decode_utf8() + .expect("invalid partition key part encoding"); + + // Inspect the final character in the string, pre-decoding, to + // determine if it has been truncated. + if value + .as_bytes() + .last() + .map(|v| *v == PARTITION_KEY_PART_TRUNCATED as u8) + .unwrap_or_default() + { + // Remove the truncation marker. + let len = decoded.len() - 1; + + // Only allocate if needed; re-borrow a subslice of `Cow::Borrowed` if not. + let column_cow = match decoded { + Cow::Borrowed(s) => Cow::Borrowed(&s[..len]), + Cow::Owned(s) => Cow::Owned(s[..len].to_string()), + }; + Some(ColumnValue::Prefix(column_cow)) + } else { + Some(ColumnValue::Identity(decoded)) + } +} + +fn parse_part_time_format(value: &str, format: &str) -> Option> { + use chrono::format::{parse, Item, Parsed}; + + let items = StrftimeItems::new(format); + + let mut parsed = Parsed::new(); + parse(&mut parsed, value, items.clone()).ok()?; + + // fill in defaults + let parsed = parsed_implicit_defaults(parsed)?; + + let begin = parsed.to_datetime_with_timezone(&Utc).ok()?; + + let mut end: Option> = None; + for item in items { + let item_end = match item { + Item::Literal(_) | Item::OwnedLiteral(_) | Item::Space(_) | Item::OwnedSpace(_) => None, + Item::Error => { + return None; + } + Item::Numeric(numeric, _pad) => { + match numeric { + Numeric::Year => Some(begin + Months::new(12)), + Numeric::Month => Some(begin + Months::new(1)), + Numeric::Day => Some(begin + Days::new(1)), + _ => { + // not supported + return None; + } + } + } + Item::Fixed(_) => { + // not implemented + return None; + } + }; + + end = match (end, item_end) { + (Some(a), Some(b)) => { + let a_d = a - begin; + let b_d = b - begin; + if a_d < b_d { + Some(a) + } else { + Some(b) + } + } + (None, Some(dt)) => Some(dt), + (Some(dt), None) => Some(dt), + (None, None) => None, + }; + } + + end.map(|end| 
ColumnValue::Datetime { begin, end }) +} + +fn parse_part_bucket(value: &str, num_buckets: u32) -> Option> { + // Parse the bucket ID from the given value string. + let bucket_id = value + .parse::() + .expect("invalid partition key bucket encoding"); + // Invariant: If the bucket ID (0 indexed) is greater than the number of + // buckets to spread data across the partition key is invalid. + assert!(bucket_id < num_buckets); + + Some(ColumnValue::Bucket(bucket_id)) +} + +fn parsed_implicit_defaults(mut parsed: chrono::format::Parsed) -> Option { + parsed.year?; + + if parsed.month.is_none() { + if parsed.day.is_some() { + return None; + } + + parsed.set_month(1).ok()?; + } + + if parsed.day.is_none() { + if parsed.hour_div_12.is_some() || parsed.hour_mod_12.is_some() { + return None; + } + + parsed.set_day(1).ok()?; + } + + if parsed.hour_div_12.is_none() || parsed.hour_mod_12.is_none() { + // consistency check + if parsed.hour_div_12.is_some() { + return None; + } + if parsed.hour_mod_12.is_some() { + return None; + } + + if parsed.minute.is_some() { + return None; + } + + parsed.set_hour(0).ok()?; + } + + if parsed.minute.is_none() { + if parsed.second.is_some() { + return None; + } + if parsed.nanosecond.is_some() { + return None; + } + + parsed.set_minute(0).ok()?; + } + + Some(parsed) +} + +/// In production code, the template should come from protobuf that is either from the database or +/// from a gRPC request. In tests, building protobuf is painful, so here's an easier way to create +/// a `TablePartitionTemplateOverride`. +/// +/// This deliberately goes around the validation of the templates so that tests can verify code +/// handles potentially invalid templates! 
+pub fn test_table_partition_override( + parts: Vec>, +) -> TablePartitionTemplateOverride { + let parts = parts + .into_iter() + .map(|part| { + let part = match part { + TemplatePart::TagValue(value) => proto::template_part::Part::TagValue(value.into()), + TemplatePart::TimeFormat(fmt) => proto::template_part::Part::TimeFormat(fmt.into()), + TemplatePart::Bucket(value, num_buckets) => { + proto::template_part::Part::Bucket(proto::Bucket { + tag_name: value.into(), + num_buckets, + }) + } + }; + + proto::TemplatePart { part: Some(part) } + }) + .collect(); + + let proto = proto::PartitionTemplate { parts }; + TablePartitionTemplateOverride(Some( + serialization::Wrapper::for_testing_possibility_of_invalid_value_in_database(proto), + )) +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + use chrono::TimeZone; + use proptest::prelude::*; + use sqlx::Encode; + use test_helpers::assert_error; + + use super::*; + + #[test] + fn test_partition_template_to_string() { + let template_empty: TablePartitionTemplateOverride = + TablePartitionTemplateOverride::default(); + + let template: Vec> = + [TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a")] + .into_iter() + .collect::>(); + let template: TablePartitionTemplateOverride = test_table_partition_override(template); + + assert_eq!(template_empty.to_string(), ""); + assert_eq!( + template.to_string(), + "{\"parts\":[{\"timeFormat\":\"%Y\"},{\"tagValue\":\"a\"}]}" + ); + } + + #[test] + fn test_max_partition_key_len() { + let max_len: usize = + // 8 parts, at most 200 bytes long. + (MAXIMUM_NUMBER_OF_TEMPLATE_PARTS * PARTITION_KEY_MAX_PART_LEN) + // 7 delimiting characters between parts. + + (MAXIMUM_NUMBER_OF_TEMPLATE_PARTS - 1); + + // If this changes, the module documentation should be changed too. + // + // This shouldn't change without consideration of primary key overlap as + // a result. 
+ assert_eq!(max_len, 1_607, "update module docs please"); + } + + #[test] + fn empty_parts_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { parts: vec![] }); + + assert_error!(err, ValidationError::NoParts); + } + + #[test] + fn more_than_8_parts_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("region".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("region".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("region".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("region".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("region".into())), + }, + ], + }); + + assert_error!(err, ValidationError::TooManyParts { specified } if specified == 9); + } + + #[test] + fn repeated_tag_name_value_is_invalid() { + // Test [`TagValue`] + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".into())), + }, + ], + }); + + assert_error!(err, ValidationError::RepeatedTagValue ( ref specified ) if specified == "bananas"); + + // Test [`Bucket`] + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + 
part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + ], + }); + + assert_error!(err, ValidationError::RepeatedTagValue ( ref specified ) if specified == "bananas"); + + // Test a combination of [`TagValue`] and [`Bucket`] + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + ], + }); + + assert_error!(err, ValidationError::RepeatedTagValue ( ref specified ) if specified == "bananas"); + } + + /// Chrono will panic when formatting a timestamp if the "%#z" formatting + /// directive is used... + #[test] + fn test_secret_formatter_advice_panic() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("%#z".into())), + }], + }); + + assert_error!(err, ValidationError::InvalidStrftime(_)); + + // This doesn't trigger the panic, but is included for completeness. 
+ let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("%#Z".into())), + }], + }); + + assert_error!(err, ValidationError::InvalidStrftime(_)); + } + + #[test] + fn invalid_strftime_format_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("%3F".into())), + }], + }); + + assert_error!(err, ValidationError::InvalidStrftime(ref format) if format == "%3F"); + } + + #[test] + fn empty_strftime_format_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("".into())), + }], + }); + + assert_error!(err, ValidationError::InvalidStrftime(ref format) if format.is_empty()); + } + + /// "time" is a special column already covered by strftime, being a time + /// series database and all. + #[test] + fn time_tag_value_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("time".into())), + }], + }); + + assert_error!(err, ValidationError::InvalidTagValue(_)); + } + + #[test] + fn empty_tag_value_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("".into())), + }], + }); + + assert_error!(err, ValidationError::InvalidTagValue(ref value) if value.is_empty()); + } + + /// "time" is a special column already covered by strftime, being a time + /// series database and all. 
+ #[test] + fn bucket_time_tag_name_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "time".into(), + num_buckets: 42, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidTagValue(_)); + } + + #[test] + fn bucket_empty_tag_name_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "".into(), + num_buckets: 42, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidTagValue(ref value) if value.is_empty()); + } + + #[test] + fn bucket_zero_num_buckets_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "arán".into(), + num_buckets: 0, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidNumberOfBuckets(0)); + } + + #[test] + fn bucket_too_high_num_buckets_is_invalid() { + const TOO_HIGH: u32 = 100_000; + + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "arán".into(), + num_buckets: TOO_HIGH, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidNumberOfBuckets(TOO_HIGH)); + } + + fn identity(s: &str) -> ColumnValue<'_> { + ColumnValue::Identity(s.into()) + } + + fn bucket(bucket_id: u32) -> ColumnValue<'static> { + ColumnValue::Bucket(bucket_id) + } + + fn prefix<'a, T>(s: T) -> ColumnValue<'a> + where + T: Into>, + { + ColumnValue::Prefix(s.into()) + } + + fn year(y: i32) -> ColumnValue<'static> { + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(y, 1, 1, 0, 0, 0).unwrap(), + end: Utc.with_ymd_and_hms(y + 1, 1, 1, 0, 0, 0).unwrap(), + } + } + + 
#[test] + fn test_iceberg_string_hash() { + assert_eq!(iceberg_hash("iceberg"), 1210000089); + } + + // This is a test fixture designed to catch accidental changes to the + // Iceberg-like hash-bucket partitioning behaviour. + // + // You shouldn't be changing this! + #[test] + fn test_hash_bucket_fixture() { + // These are values lifted from the iceberg spark test suite for + // `BucketString`, sadly not provided in the reference/spec: + // + // https://github.com/apache/iceberg/blob/31e31fd819c846f49d2bd459b8bfadfdc3c2bc3a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkBucketFunction.java#L151-L169 + // + assert_eq!(bucket_for_tag_value("abcdefg", 5), 4); + assert_eq!(bucket_for_tag_value("abc", 128), 122); + assert_eq!(bucket_for_tag_value("abcde", 64), 54); + assert_eq!(bucket_for_tag_value("测试", 12), 8); + assert_eq!(bucket_for_tag_value("测试raul试测", 16), 1); + assert_eq!(bucket_for_tag_value("", 16), 0); + + // These are pre-existing arbitrary fixture values + assert_eq!(bucket_for_tag_value("bananas", 10), 1); + assert_eq!(bucket_for_tag_value("plátanos", 100), 98); + assert_eq!(bucket_for_tag_value("crobhaing bananaí", 1000), 166); + assert_eq!(bucket_for_tag_value("bread", 42), 9); + assert_eq!(bucket_for_tag_value("arán", 76), 72); + assert_eq!(bucket_for_tag_value("banana arán", 1337), 1284); + assert_eq!( + bucket_for_tag_value("uasmhéid bananaí", u32::MAX), + 1109892861 + ); + } + + /// Test to approximate and show how the tag value maps to the partition key + /// for the example cases in the mod-doc. The behaviour that renders the key + /// itself is a combination of this bucket assignment and the render logic. + #[test] + fn test_bucket_for_mod_doc() { + assert_eq!(bucket_for_tag_value("ananas", 10), 5); + assert_eq!(bucket_for_tag_value("!", 10), 8); + assert_eq!(bucket_for_tag_value("%50", 10), 9); + assert_eq!(bucket_for_tag_value("", 10), 0); + } + + proptest! 
{ + #[test] + fn prop_consistent_bucketing_within_limits(tag_values in proptest::collection::vec(any::(), (1, 10)), num_buckets in any::()) { + for value in tag_values { + // First pass assign + let want_bucket = bucket_for_tag_value(&value, num_buckets); + // The assigned bucket must fit within the domain given to the bucketer. + assert!(want_bucket < num_buckets); + // Feed in the same tag value, expect the same result. + let got_bucket = bucket_for_tag_value(&value, num_buckets); + assert_eq!(want_bucket, got_bucket); + } + } + } + + /// Generate a test that asserts "partition_key" is reversible, yielding + /// "want" assuming the partition "template" was used. + macro_rules! test_build_column_values { + ( + $name:ident, + template = $template:expr, // Array/vec of TemplatePart + partition_key = $partition_key:expr, // String derived partition key + want = $want:expr // Expected build_column_values() output + ) => { + paste::paste! { + #[test] + fn []() { + let template = $template.into_iter().collect::>(); + let template = test_table_partition_override(template); + + // normalise the values into a (str, ColumnValue) for the comparison + let want = $want + .into_iter() + .collect::>(); + + let input = String::from($partition_key); + let got = build_column_values(&template, input.as_str()) + .collect::>(); + + assert_eq!(got, want); + } + } + }; + } + + test_build_column_values!( + module_doc_example_1, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), + ], + partition_key = "2023|bananas|plátanos|5", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("bananas")), + ("b", identity("plátanos")), + ("c", bucket(5)), + ] + ); + + test_build_column_values!( + module_doc_example_2, // Examples 2 and 3 are the same partition key + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 
10), + ], + partition_key = "2023|!|plátanos|!", + want = [(TIME_COLUMN_NAME, year(2023)), ("b", identity("plátanos")),] + ); + + test_build_column_values!( + module_doc_example_4, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), + ], + partition_key = "2023|!|!|!", + want = [(TIME_COLUMN_NAME, year(2023)),] + ); + + test_build_column_values!( + module_doc_example_5, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), + ], + partition_key = "2023|cat%7Cdog|%21|8", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("cat|dog")), + ("b", identity("!")), + ("c", bucket(8)), + ] + ); + + test_build_column_values!( + module_doc_example_6, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), + ], + partition_key = "2023|%2550|!|9", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("%50")), + ("c", bucket(9)), + ] + ); + + test_build_column_values!( + module_doc_example_7, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), + ], + partition_key = "2023|^|!|0", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("")), + ("c", bucket(0)), + ] + ); + + test_build_column_values!( + module_doc_example_8, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), + ], + partition_key = "2023|BANANAS#|!|!|!", + want = [(TIME_COLUMN_NAME, year(2023)), ("a", prefix("BANANAS")),] + ); + + test_build_column_values!( + unicode_code_point_prefix, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), + ], + 
partition_key = "2023|%28%E3%83%8E%E0%B2%A0%E7%9B%8A%E0%B2%A0%29%E3%83%8E%E5%BD%A1%E2%94%BB%E2%94%81%E2%94%BB#|!|!", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", prefix("(ノಠ益ಠ)ノ彡┻━┻")), + ] + ); + + test_build_column_values!( + unicode_grapheme, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + ], + partition_key = "2023|%E0%AE%A8%E0%AE%BF#|!", + want = [(TIME_COLUMN_NAME, year(2023)), ("a", prefix("நி")),] + ); + + test_build_column_values!( + unambiguous, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + ], + partition_key = "2023|is%7Cnot%21ambiguous%2510%23|!", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("is|not!ambiguous%10#")), + ] + ); + + test_build_column_values!( + datetime_fixed, + template = [TemplatePart::TimeFormat("foo"),], + partition_key = "foo", + want = [] + ); + + test_build_column_values!( + datetime_null, + template = [TemplatePart::TimeFormat("%Y"),], + partition_key = "!", + want = [] + ); + + test_build_column_values!( + datetime_range_y, + template = [TemplatePart::TimeFormat("%Y"),], + partition_key = "2023", + want = [( + TIME_COLUMN_NAME, + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(2023, 1, 1, 0, 0, 0).unwrap(), + end: Utc.with_ymd_and_hms(2024, 1, 1, 0, 0, 0).unwrap(), + }, + )] + ); + + test_build_column_values!( + datetime_range_y_m, + template = [TemplatePart::TimeFormat("%Y-%m"),], + partition_key = "2023-09", + want = [( + TIME_COLUMN_NAME, + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(2023, 9, 1, 0, 0, 0).unwrap(), + end: Utc.with_ymd_and_hms(2023, 10, 1, 0, 0, 0).unwrap(), + }, + )] + ); + + test_build_column_values!( + datetime_range_y_m_overflow_year, + template = [TemplatePart::TimeFormat("%Y-%m"),], + partition_key = "2023-12", + want = [( + TIME_COLUMN_NAME, + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(2023, 12, 1, 0, 0, 0).unwrap(), + end: 
Utc.with_ymd_and_hms(2024, 1, 1, 0, 0, 0).unwrap(), + }, + )] + ); + + test_build_column_values!( + datetime_range_y_m_d, + template = [TemplatePart::TimeFormat("%Y-%m-%d"),], + partition_key = "2023-09-01", + want = [( + TIME_COLUMN_NAME, + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(2023, 9, 1, 0, 0, 0).unwrap(), + end: Utc.with_ymd_and_hms(2023, 9, 2, 0, 0, 0).unwrap(), + }, + )] + ); + + test_build_column_values!( + datetime_range_y_m_d_overflow_month, + template = [TemplatePart::TimeFormat("%Y-%m-%d"),], + partition_key = "2023-09-30", + want = [( + TIME_COLUMN_NAME, + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(2023, 9, 30, 0, 0, 0).unwrap(), + end: Utc.with_ymd_and_hms(2023, 10, 1, 0, 0, 0).unwrap(), + }, + )] + ); + + test_build_column_values!( + datetime_range_y_m_d_overflow_year, + template = [TemplatePart::TimeFormat("%Y-%m-%d"),], + partition_key = "2023-12-31", + want = [( + TIME_COLUMN_NAME, + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(2023, 12, 31, 0, 0, 0).unwrap(), + end: Utc.with_ymd_and_hms(2024, 1, 1, 0, 0, 0).unwrap(), + }, + )] + ); + + test_build_column_values!( + datetime_range_d_m_y, + template = [TemplatePart::TimeFormat("%d-%m-%Y"),], + partition_key = "01-09-2023", + want = [( + TIME_COLUMN_NAME, + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(2023, 9, 1, 0, 0, 0).unwrap(), + end: Utc.with_ymd_and_hms(2023, 9, 2, 0, 0, 0).unwrap(), + }, + )] + ); + + test_build_column_values!( + bucket_part_fixture, + template = [ + TemplatePart::Bucket("a", 41), + TemplatePart::Bucket("b", 91), + TemplatePart::Bucket("c", 144) + ], + partition_key = "1|2|3", + want = [("a", bucket(1)), ("b", bucket(2)), ("c", bucket(3)),] + ); + + #[test] + #[should_panic] + fn test_build_column_values_bucket_part_out_of_range_panics() { + let template = [ + TemplatePart::Bucket("a", 42), + TemplatePart::Bucket("b", 42), + TemplatePart::Bucket("c", 42), + ] + .into_iter() + .collect::>(); + let template = 
test_table_partition_override(template); + + // normalise the values into a (str, ColumnValue) for the comparison + let input = String::from("1|1|43"); + let _ = build_column_values(&template, input.as_str()).collect::>(); + } + + #[test] + #[should_panic] + fn test_build_column_values_bucket_part_not_u32_panics() { + let template = [ + TemplatePart::Bucket("a", 42), + TemplatePart::Bucket("b", 42), + TemplatePart::Bucket("c", 42), + ] + .into_iter() + .collect::>(); + let template = test_table_partition_override(template); + + // normalise the values into a (str, ColumnValue) for the comparison + let input = String::from("1|1|bananas"); + let _ = build_column_values(&template, input.as_str()).collect::>(); + } + + test_build_column_values!( + datetime_not_compact_y_d, + template = [TemplatePart::TimeFormat("%Y-%d"),], + partition_key = "2023-01", + want = [] + ); + + test_build_column_values!( + datetime_not_compact_m, + template = [TemplatePart::TimeFormat("%m"),], + partition_key = "01", + want = [] + ); + + test_build_column_values!( + datetime_not_compact_d, + template = [TemplatePart::TimeFormat("%d"),], + partition_key = "01", + want = [] + ); + + test_build_column_values!( + datetime_range_unimplemented_y_m_d_h, + template = [TemplatePart::TimeFormat("%Y-%m-%dT%H"),], + partition_key = "2023-12-31T00", + want = [] + ); + + test_build_column_values!( + datetime_range_unimplemented_y_m_d_h_m, + template = [TemplatePart::TimeFormat("%Y-%m-%dT%H:%M"),], + partition_key = "2023-12-31T00:00", + want = [] + ); + + test_build_column_values!( + datetime_range_unimplemented_y_m_d_h_m_s, + template = [TemplatePart::TimeFormat("%Y-%m-%dT%H:%M:%S"),], + partition_key = "2023-12-31T00:00:00", + want = [] + ); + + test_build_column_values!( + empty_tag_only, + template = [TemplatePart::TagValue("a")], + partition_key = "!", + want = [] + ); + + #[test] + fn test_null_partition_key_char_str_equality() { + assert_eq!( + PARTITION_KEY_VALUE_NULL.to_string(), + 
PARTITION_KEY_VALUE_NULL_STR + ); + } + + #[test] + fn test_column_value_partial_eq() { + assert_eq!(identity("bananas"), "bananas"); + + assert_ne!(identity("bananas"), "bananas2"); + assert_ne!(identity("bananas2"), "bananas"); + + assert_ne!(prefix("bananas"), "bananas"); + assert_ne!(prefix("bananas"), "bananas2"); + assert_ne!(prefix("bananas2"), "bananas"); + } + + #[test] + fn test_column_value_is_prefix_match() { + let b = "bananas".to_string(); + assert!(identity("bananas").is_prefix_match_of(b)); + + assert!(identity("bananas").is_prefix_match_of("bananas")); + assert!(identity("bananas").is_prefix_match_of("bananas2")); + + assert!(prefix("bananas").is_prefix_match_of("bananas")); + assert!(prefix("bananas").is_prefix_match_of("bananas2")); + + assert!(!identity("bananas2").is_prefix_match_of("bananas")); + assert!(!prefix("bananas2").is_prefix_match_of("bananas")); + } + + /// This test asserts the default derived partitioning scheme with no + /// overrides. + /// + /// Changing this default during the lifetime of a cluster will cause the + /// implicit (not overridden) partition schemes to change, potentially + /// breaking the system invariant that a given primary keys maps to a + /// single partition. + /// + /// You shouldn't be changing this! 
+ #[test] + fn test_default_template_fixture() { + let ns = NamespacePartitionTemplateOverride::default(); + let table = TablePartitionTemplateOverride::try_new(None, &ns).unwrap(); + let got = table.parts().collect::>(); + assert_matches!(got.as_slice(), [TemplatePart::TimeFormat("%Y-%m-%d")]); + } + + #[test] + fn len_of_default_template_is_1() { + let ns = NamespacePartitionTemplateOverride::default(); + let t = TablePartitionTemplateOverride::try_new(None, &ns).unwrap(); + + assert_eq!(t.len(), 1); + } + + #[test] + fn no_custom_table_template_specified_gets_namespace_template() { + let namespace_template = + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }) + .unwrap(); + let table_template = + TablePartitionTemplateOverride::try_new(None, &namespace_template).unwrap(); + + assert_eq!(table_template.len(), 1); + assert_eq!(table_template.0, namespace_template.0); + } + + #[test] + fn custom_table_template_specified_ignores_namespace_template() { + let custom_table_template = proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("region".into())), + }], + }; + let namespace_template = + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }) + .unwrap(); + let table_template = TablePartitionTemplateOverride::try_new( + Some(custom_table_template.clone()), + &namespace_template, + ) + .unwrap(); + + assert_eq!(table_template.len(), 1); + assert_eq!(table_template.0.unwrap().inner(), &custom_table_template); + } + + // The JSON representation of the partition template protobuf is stored in the database, so + // the encode/decode implementations need to be stable if we want to avoid having to + // migrate the values stored in the 
database. + + #[test] + fn proto_encode_json_stability() { + let custom_template = proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("region".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + ], + }; + let expected_json_str = "{\"parts\":[\ + {\"tagValue\":\"region\"},\ + {\"timeFormat\":\"year-%Y\"},\ + {\"bucket\":{\"tagName\":\"bananas\",\"numBuckets\":42}}\ + ]}"; + + let namespace = NamespacePartitionTemplateOverride::try_from(custom_template).unwrap(); + let mut buf = Default::default(); + let _ = >::encode_by_ref( + &namespace, &mut buf, + ); + + fn extract_sqlite_argument_text( + argument_value: &sqlx::sqlite::SqliteArgumentValue<'_>, + ) -> String { + match argument_value { + sqlx::sqlite::SqliteArgumentValue::Text(cow) => cow.to_string(), + other => panic!("Expected Text values, got: {other:?}"), + } + } + + let namespace_json_str: String = buf.iter().map(extract_sqlite_argument_text).collect(); + assert_eq!(namespace_json_str, expected_json_str); + + let table = TablePartitionTemplateOverride::try_new(None, &namespace).unwrap(); + let mut buf = Default::default(); + let _ = >::encode_by_ref( + &table, &mut buf, + ); + let table_json_str: String = buf.iter().map(extract_sqlite_argument_text).collect(); + assert_eq!(table_json_str, expected_json_str); + assert_eq!(table.len(), 3); + } + + #[test] + fn test_template_size_reporting() { + const BASE_SIZE: usize = std::mem::size_of::() + + std::mem::size_of::(); + + let first_string = "^"; + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue(first_string.into())), + }], + }), + 
&NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template "); + + assert_eq!( + template.size(), + BASE_SIZE + std::mem::size_of::() + first_string.len() + ); + + let second_string = "region"; + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue(second_string.into())), + }], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template "); + + assert_eq!( + template.size(), + BASE_SIZE + std::mem::size_of::() + second_string.len() + ); + + let time_string = "year-%Y"; + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue(second_string.into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat(time_string.into())), + }, + ], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template "); + assert_eq!( + template.size(), + BASE_SIZE + + std::mem::size_of::() + + second_string.len() + + std::mem::size_of::() + + time_string.len() + ); + + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: second_string.into(), + num_buckets: 42, + })), + }], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template"); + assert_eq!( + template.size(), + BASE_SIZE + + std::mem::size_of::() + + second_string.len() + + std::mem::size_of::() + ); + } +} diff --git a/data_types/src/sequence_number_set.rs b/data_types/src/sequence_number_set.rs new file mode 100644 index 0000000..ff8ca41 --- /dev/null +++ b/data_types/src/sequence_number_set.rs @@ -0,0 +1,325 @@ 
+//! A set of [`SequenceNumber`] instances. + +use std::collections::BTreeMap; + +use crate::SequenceNumber; + +/// A space-efficient encoded set of [`SequenceNumber`]. +#[derive(Debug, Default, Clone, PartialEq)] +pub struct SequenceNumberSet(croaring::Treemap); + +impl SequenceNumberSet { + /// Add the specified [`SequenceNumber`] to the set. + pub fn add(&mut self, n: SequenceNumber) { + self.0.add(n.get() as _); + } + + /// Remove the specified [`SequenceNumber`] to the set, if present. + /// + /// This is a no-op if `n` was not part of `self`. + pub fn remove(&mut self, n: SequenceNumber) { + self.0.remove(n.get() as _); + } + + /// Add all the [`SequenceNumber`] in `other` to `self`. + /// + /// The result of this operation is the set union of both input sets. + pub fn add_set(&mut self, other: &Self) { + self.0.or_inplace(&other.0) + } + + /// Remove all the [`SequenceNumber`] in `other` from `self`. + pub fn remove_set(&mut self, other: &Self) { + self.0.andnot_inplace(&other.0) + } + + /// Reduce the memory usage of this set (trading off immediate CPU time) by + /// efficiently re-encoding the set (using run-length encoding). + pub fn run_optimise(&mut self) { + self.0.run_optimize(); + } + + /// Return true if the specified [`SequenceNumber`] has been added to + /// `self`. + pub fn contains(&self, n: SequenceNumber) -> bool { + self.0.contains(n.get() as _) + } + + /// Returns the number of [`SequenceNumber`] in this set. + pub fn len(&self) -> u64 { + self.0.cardinality() + } + + /// Return `true` if there are no [`SequenceNumber`] in this set. + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Return an iterator of all [`SequenceNumber`] in this set. + pub fn iter(&self) -> impl Iterator + '_ { + self.0.iter().map(|v| SequenceNumber::new(v as _)) + } + + /// Initialise a [`SequenceNumberSet`] that is pre-allocated to contain up + /// to `n` elements without reallocating. 
+ pub fn with_capacity(n: u32) -> Self { + let mut map = BTreeMap::new(); + map.insert(0, croaring::Bitmap::with_container_capacity(n)); + Self(croaring::Treemap { map }) + } +} + +impl Extend for SequenceNumberSet { + fn extend>(&mut self, iter: T) { + self.0.extend(iter.into_iter().map(|v| v.get() as _)) + } +} + +impl Extend for SequenceNumberSet { + fn extend>(&mut self, iter: T) { + for new_set in iter { + self.add_set(&new_set); + } + } +} + +impl FromIterator for SequenceNumberSet { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().map(|v| v.get() as _).collect()) + } +} + +/// Return the intersection of `self` and `other`. +pub fn intersect(a: &SequenceNumberSet, b: &SequenceNumberSet) -> SequenceNumberSet { + SequenceNumberSet(a.0.and(&b.0)) +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use proptest::{prelude::prop, proptest, strategy::Strategy}; + + use super::*; + + #[test] + fn test_set_operations() { + let mut a = SequenceNumberSet::default(); + let mut b = SequenceNumberSet::default(); + + // Add an element and check it is readable + a.add(SequenceNumber::new(1)); + assert!(a.contains(SequenceNumber::new(1))); + assert_eq!(a.len(), 1); + assert_eq!(a.iter().collect::>(), vec![SequenceNumber::new(1)]); + assert!(!a.contains(SequenceNumber::new(42))); + + // Merging an empty set into a should not change a + a.add_set(&b); + assert_eq!(a.len(), 1); + assert!(a.contains(SequenceNumber::new(1))); + + // Merging a non-empty set should add the new elements + b.add(SequenceNumber::new(2)); + a.add_set(&b); + assert_eq!(a.len(), 2); + assert!(a.contains(SequenceNumber::new(1))); + assert!(a.contains(SequenceNumber::new(2))); + + // Removing the set should return it to the pre-merged state. 
+ a.remove_set(&b); + assert_eq!(a.len(), 1); + assert!(a.contains(SequenceNumber::new(1))); + + // Removing a non-existant element should be a NOP + a.remove(SequenceNumber::new(42)); + assert_eq!(a.len(), 1); + + // Removing the last element should result in an empty set. + a.remove(SequenceNumber::new(1)); + assert_eq!(a.len(), 0); + } + + #[test] + fn test_extend() { + let mut a = SequenceNumberSet::default(); + a.add(SequenceNumber::new(42)); + + let extend_set = [SequenceNumber::new(4), SequenceNumber::new(2)]; + + assert!(a.contains(SequenceNumber::new(42))); + assert!(!a.contains(SequenceNumber::new(4))); + assert!(!a.contains(SequenceNumber::new(2))); + + a.extend(extend_set); + + assert!(a.contains(SequenceNumber::new(42))); + assert!(a.contains(SequenceNumber::new(4))); + assert!(a.contains(SequenceNumber::new(2))); + } + + #[test] + fn test_extend_multiple_sets() { + let mut a = SequenceNumberSet::default(); + a.add(SequenceNumber::new(7)); + + let b = [SequenceNumber::new(13), SequenceNumber::new(76)]; + let c = [SequenceNumber::new(42), SequenceNumber::new(64)]; + + assert!(a.contains(SequenceNumber::new(7))); + for &num in [b, c].iter().flatten() { + assert!(!a.contains(num)); + } + + a.extend([ + SequenceNumberSet::from_iter(b), + SequenceNumberSet::from_iter(c), + ]); + assert!(a.contains(SequenceNumber::new(7))); + for &num in [b, c].iter().flatten() { + assert!(a.contains(num)); + } + } + + #[test] + fn test_collect() { + let collect_set = [SequenceNumber::new(4), SequenceNumber::new(2)]; + + let a = collect_set.into_iter().collect::(); + + assert!(!a.contains(SequenceNumber::new(42))); + assert!(a.contains(SequenceNumber::new(4))); + assert!(a.contains(SequenceNumber::new(2))); + } + + #[test] + fn test_partial_eq() { + let mut a = SequenceNumberSet::default(); + let mut b = SequenceNumberSet::default(); + + assert_eq!(a, b); + + a.add(SequenceNumber::new(42)); + assert_ne!(a, b); + + b.add(SequenceNumber::new(42)); + assert_eq!(a, b); + + 
b.add(SequenceNumber::new(24)); + assert_ne!(a, b); + + a.add(SequenceNumber::new(24)); + assert_eq!(a, b); + } + + #[test] + fn test_intersect() { + let a = [0, u64::MAX, 40, 41, 42, 43, 44, 45] + .into_iter() + .map(SequenceNumber::new) + .collect::(); + + let b = [1, 5, u64::MAX, 42] + .into_iter() + .map(SequenceNumber::new) + .collect::(); + + let intersection = intersect(&a, &b); + let want = [u64::MAX, 42] + .into_iter() + .map(SequenceNumber::new) + .collect::(); + + assert_eq!(intersection, want); + } + + /// Yield vec's of [`SequenceNumber`] derived from u64 values. + /// + /// This matches how the ingester allocates [`SequenceNumber`] - from a u64 + /// source. + fn sequence_number_vec() -> impl Strategy> { + prop::collection::vec(0..u64::MAX, 0..1024) + .prop_map(|vec| vec.into_iter().map(SequenceNumber::new).collect()) + } + + // The following tests compare to an order-independent HashSet, as the + // SequenceNumber uses the PartialOrd impl of the inner u64 for ordering, + // resulting in incorrect output when compared to an ordered set of cast as + // u64. + // + // https://github.com/influxdata/influxdb_iox/issues/7260 + // + // These tests also cover, collect()-ing to a SequenceNumberSet, etc. + proptest! { + /// Perform a SequenceNumberSet intersection test comparing the results + /// to the known-good stdlib HashSet intersection implementation. 
+ #[test] + fn prop_set_intersection( + a in sequence_number_vec(), + b in sequence_number_vec() + ) { + let known_a = a.iter().cloned().collect::>(); + let known_b = b.iter().cloned().collect::>(); + let set_a = a.into_iter().collect::(); + let set_b = b.into_iter().collect::(); + + // The sets should be equal + assert_eq!(set_a.iter().collect::>(), known_a, "set a does not match"); + assert_eq!(set_b.iter().collect::>(), known_b, "set b does not match"); + + let known_intersection = known_a.intersection(&known_b).cloned().collect::>(); + let set_intersection = intersect(&set_a, &set_b).iter().collect::>(); + + // The set intersections should be equal. + assert_eq!(set_intersection, known_intersection); + } + + /// Perform a SequenceNumberSet remove_set test comparing the results to + /// the known-good stdlib HashSet difference implementation. + #[test] + fn prop_set_difference( + a in sequence_number_vec(), + b in sequence_number_vec() + ) { + let known_a = a.iter().cloned().collect::>(); + let known_b = b.iter().cloned().collect::>(); + let mut set_a = a.into_iter().collect::(); + let set_b = b.into_iter().collect::(); + + // The sets should be equal + assert_eq!(set_a.iter().collect::>(), known_a, "set a does not match"); + assert_eq!(set_b.iter().collect::>(), known_b, "set b does not match"); + + let known_a = known_a.difference(&known_b).cloned().collect::>(); + set_a.remove_set(&set_b); + let set_a = set_a.iter().collect::>(); + + // The set difference should be equal. + assert_eq!(set_a, known_a); + } + + /// Perform a SequenceNumberSet add_set test comparing the results to + /// the known-good stdlib HashSet or implementation. 
+ #[test] + fn prop_set_add( + a in sequence_number_vec(), + b in sequence_number_vec() + ) { + let known_a = a.iter().cloned().collect::>(); + let known_b = b.iter().cloned().collect::>(); + let mut set_a = a.into_iter().collect::(); + let set_b = b.into_iter().collect::(); + + // The sets should be equal + assert_eq!(set_a.iter().collect::>(), known_a, "set a does not match"); + assert_eq!(set_b.iter().collect::>(), known_b, "set b does not match"); + + let known_a = known_a.union(&known_b).cloned().collect::>(); + set_a.add_set(&set_b); + let set_a = set_a.iter().collect::>(); + + // The sets should be equal. + assert_eq!(set_a, known_a); + } + } +} diff --git a/data_types/src/service_limits.rs b/data_types/src/service_limits.rs new file mode 100644 index 0000000..7c00b6a --- /dev/null +++ b/data_types/src/service_limits.rs @@ -0,0 +1,311 @@ +//! Types protecting production by implementing limits on customer data. + +use generated_types::influxdata::iox::namespace::{ + v1 as namespace_proto, v1::update_namespace_service_protection_limit_request::LimitUpdate, +}; +use observability_deps::tracing::*; +use std::num::NonZeroUsize; +use thiserror::Error; + +/// Definitions that apply to both MaxColumnsPerTable and MaxTables. Note that the hardcoded +/// default value specified in the macro invocation must be greater than 0 and fit in an `i32`. +macro_rules! define_service_limit { + ($type_name:ident, $default_value:expr, $documentation:expr) => { + /// $documentation + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] + pub struct $type_name(NonZeroUsize); + + impl TryFrom for $type_name { + type Error = ServiceLimitError; + + fn try_from(value: usize) -> Result { + // Even though the value is stored as a `usize`, service limits are stored as `i32` + // in the database and transferred as i32 over protobuf. So try to convert to an + // `i32` (and throw away the result) so that we know about invalid values before + // trying to use them. 
+ if i32::try_from(value).is_err() { + return Err(ServiceLimitError::MustFitInI32); + } + + let nonzero_value = + NonZeroUsize::new(value).ok_or(ServiceLimitError::MustBeGreaterThanZero)?; + + Ok(Self(nonzero_value)) + } + } + + impl TryFrom for $type_name { + type Error = ServiceLimitError; + + fn try_from(value: u64) -> Result { + // Even though the value is stored as a `usize`, service limits are stored as `i32` + // in the database and transferred as i32 over protobuf. So try to convert to an + // `i32` (and throw away the result) so that we know about invalid values before + // trying to use them. + if i32::try_from(value).is_err() { + return Err(ServiceLimitError::MustFitInI32); + } + + let nonzero_value = usize::try_from(value) + .ok() + .and_then(NonZeroUsize::new) + .ok_or(ServiceLimitError::MustBeGreaterThanZero)?; + + Ok(Self(nonzero_value)) + } + } + + impl TryFrom for $type_name { + type Error = ServiceLimitError; + + fn try_from(value: i32) -> Result { + let nonzero_value = usize::try_from(value) + .ok() + .and_then(NonZeroUsize::new) + .ok_or(ServiceLimitError::MustBeGreaterThanZero)?; + + Ok(Self(nonzero_value)) + } + } + + #[allow(missing_docs)] + impl $type_name { + pub fn get(&self) -> usize { + self.0.get() + } + + /// For use by the database and some protobuf representations. It should not be + /// possible to construct an instance that contains a `NonZeroUsize` that won't fit in + /// an `i32`. + pub fn get_i32(&self) -> i32 { + self.0.get() as i32 + } + + /// Constant-time default for use in constructing test constants. + pub const fn const_default() -> Self { + // This is safe because the hardcoded value is not 0. 
+ let value = unsafe { NonZeroUsize::new_unchecked($default_value) }; + + Self(value) + } + } + + impl Default for $type_name { + fn default() -> Self { + Self::const_default() + } + } + + impl std::fmt::Display for $type_name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } + } + + // Tell sqlx this is an i32 in the database. + impl sqlx::Type for $type_name + where + i32: sqlx::Type, + DB: sqlx::Database, + { + fn type_info() -> DB::TypeInfo { + >::type_info() + } + } + + impl<'q, DB> sqlx::Encode<'q, DB> for $type_name + where + DB: sqlx::Database, + i32: sqlx::Encode<'q, DB>, + { + fn encode_by_ref( + &self, + buf: &mut >::ArgumentBuffer, + ) -> sqlx::encode::IsNull { + >::encode_by_ref(&self.get_i32(), buf) + } + } + + // The database stores i32s, so there's a chance of invalid values already being stored in + // there. When deserializing those values, rather than panicking or returning an error, log + // and use the default instead. + impl<'r, DB: ::sqlx::Database> ::sqlx::decode::Decode<'r, DB> for $type_name + where + i32: sqlx::Decode<'r, DB>, + { + fn decode( + value: >::ValueRef, + ) -> ::std::result::Result< + Self, + ::std::boxed::Box< + dyn ::std::error::Error + 'static + ::std::marker::Send + ::std::marker::Sync, + >, + > { + let data = >::decode(value)?; + + let data = Self::try_from(data).unwrap_or_else(|_| { + error!("database contains invalid $type_name value {data}, using default value"); + Self::default() + }); + + Ok(data) + } + } + }; +} + +define_service_limit!(MaxTables, 500, "Max tables allowed in a namespace."); +define_service_limit!( + MaxColumnsPerTable, + 200, + "Max columns per table allowed in a namespace." +); + +/// Overrides for service protection limits. 
+#[derive(Debug, Copy, Clone)] +pub struct NamespaceServiceProtectionLimitsOverride { + /// The maximum number of tables that can exist in this namespace + pub max_tables: Option, + /// The maximum number of columns per table in this namespace + pub max_columns_per_table: Option, +} + +impl TryFrom + for NamespaceServiceProtectionLimitsOverride +{ + type Error = ServiceLimitError; + + fn try_from(value: namespace_proto::ServiceProtectionLimits) -> Result { + let namespace_proto::ServiceProtectionLimits { + max_tables, + max_columns_per_table, + } = value; + + Ok(Self { + max_tables: max_tables.map(MaxTables::try_from).transpose()?, + max_columns_per_table: max_columns_per_table + .map(MaxColumnsPerTable::try_from) + .transpose()?, + }) + } +} + +/// Updating one, but not both, of the limits is what the UpdateNamespaceServiceProtectionLimit +/// gRPC request supports, so match that encoding on the Rust side. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ServiceLimitUpdate { + /// Requesting an update to the maximum number of tables allowed in this namespace + MaxTables(MaxTables), + /// Requesting an update to the maximum number of columns allowed in each table in this + /// namespace + MaxColumnsPerTable(MaxColumnsPerTable), +} + +/// Errors converting from raw values to the service limits +#[derive(Error, Debug, Clone, Copy)] +pub enum ServiceLimitError { + /// A negative or 0 value was specified; those aren't allowed + #[error("service limit values must be greater than 0")] + MustBeGreaterThanZero, + + /// No value was provided so we can't update anything + #[error("a supported service limit value is required")] + NoValueSpecified, + + /// Limits are stored as `i32` in the database and transferred as i32 over protobuf, so even + /// though they are stored as `usize` in Rust, the `usize` value must be less than `i32::MAX`. 
+ #[error("service limit values must fit in a 32-bit signed integer (`i32`)")] + MustFitInI32, +} + +impl TryFrom> for ServiceLimitUpdate { + type Error = ServiceLimitError; + + fn try_from(limit_update: Option) -> Result { + match limit_update { + Some(LimitUpdate::MaxTables(n)) => { + Ok(ServiceLimitUpdate::MaxTables(MaxTables::try_from(n)?)) + } + Some(LimitUpdate::MaxColumnsPerTable(n)) => Ok(ServiceLimitUpdate::MaxColumnsPerTable( + MaxColumnsPerTable::try_from(n)?, + )), + None => Err(ServiceLimitError::NoValueSpecified), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn extract_sqlite_argument_i32(argument_value: &sqlx::sqlite::SqliteArgumentValue<'_>) -> i32 { + match argument_value { + sqlx::sqlite::SqliteArgumentValue::Int(i) => *i, + other => panic!("Expected Int values, got: {other:?}"), + } + } + + macro_rules! service_limit_test { + ($type_name:ident, $module_name: ident) => { + mod $module_name { + use super::*; + + fn success>(value: T, expected: usize) + where + >::Error: std::fmt::Debug, + { + assert_eq!(value.try_into().unwrap().get(), expected); + } + + #[test] + fn successful_conversions() { + success(1usize, 1); + success(1u64, 1); + success(1i32, 1); + success(i32::MAX, i32::MAX as usize); + } + + fn failure>(value: T, expected_error_message: &str) + where + >::Error: std::fmt::Debug + std::fmt::Display, + { + assert_eq!( + value.try_into().unwrap_err().to_string(), + expected_error_message + ); + } + + #[test] + fn failed_conversions() { + failure(0usize, "service limit values must be greater than 0"); + failure(0u64, "service limit values must be greater than 0"); + failure(0i32, "service limit values must be greater than 0"); + failure(-1i32, "service limit values must be greater than 0"); + failure( + i32::MAX as usize + 1, + "service limit values must fit in a 32-bit signed integer (`i32`)", + ); + failure( + i32::MAX as u64 + 1, + "service limit values must fit in a 32-bit signed integer (`i32`)", + ); + } + + #[test] + fn 
encode() { + let value = $type_name::try_from(10).unwrap(); + let mut buf = Default::default(); + let _ = <$type_name as sqlx::Encode<'_, sqlx::Sqlite>>::encode_by_ref( + &value, &mut buf, + ); + + let encoded: Vec<_> = buf.iter().map(extract_sqlite_argument_i32).collect(); + assert_eq!(encoded, &[value.get_i32()]); + } + } + }; + } + + service_limit_test!(MaxTables, max_tables); + service_limit_test!(MaxColumnsPerTable, max_columns_per_table); +} diff --git a/data_types/src/snapshot/hash.rs b/data_types/src/snapshot/hash.rs new file mode 100644 index 0000000..adf8c24 --- /dev/null +++ b/data_types/src/snapshot/hash.rs @@ -0,0 +1,219 @@ +//! A primitive hash table supporting linear probing + +use bytes::Bytes; +use generated_types::influxdata::iox::catalog_cache::v1 as generated; +use siphasher::sip::SipHasher24; + +use snafu::{ensure, Snafu}; + +/// Error for [`HashBuckets`] +#[derive(Debug, Snafu)] +#[allow(missing_docs, missing_copy_implementations)] +pub enum Error { + #[snafu(display("Bucket length not a power of two"))] + BucketsNotPower, + #[snafu(display("Unrecognized hash function"))] + UnrecognizedHash, +} + +/// Result for [`HashBuckets`] +pub type Result = std::result::Result; + +/// A primitive hash table supporting [linear probing] +/// +/// [linear probing](https://en.wikipedia.org/wiki/Linear_probing) +#[derive(Debug, Clone)] +pub struct HashBuckets { + /// The mask to yield index in `buckets` from a u64 hash + mask: usize, + /// A sequence of u32 encoding the value index + 1, or 0 if empty + buckets: Bytes, + /// The hash function to use + hash: SipHasher24, +} + +impl HashBuckets { + /// Performs a lookup of `value` + pub fn lookup(&self, value: &[u8]) -> HashProbe<'_> { + self.lookup_raw(self.hash.hash(value)) + } + + fn lookup_raw(&self, hash: u64) -> HashProbe<'_> { + let idx = (hash as usize) & self.mask; + HashProbe { + idx, + buckets: self, + mask: self.mask as _, + } + } +} + +impl TryFrom for HashBuckets { + type Error = Error; + + fn 
try_from(value: generated::HashBuckets) -> std::result::Result { + let buckets_len = value.buckets.len(); + ensure!(buckets_len.count_ones() == 1, BucketsNotPowerSnafu); + let mask = buckets_len.wrapping_sub(1) ^ 3; + match value.hash_function { + Some(generated::hash_buckets::HashFunction::SipHash24(s)) => Ok(Self { + mask, + buckets: value.buckets, + hash: SipHasher24::new_with_keys(s.key0, s.key1), + }), + _ => Err(Error::UnrecognizedHash), + } + } +} + +impl From for generated::HashBuckets { + fn from(value: HashBuckets) -> Self { + let (key0, key1) = value.hash.keys(); + Self { + buckets: value.buckets, + hash_function: Some(generated::hash_buckets::HashFunction::SipHash24( + generated::SipHash24 { key0, key1 }, + )), + } + } +} + +/// Yields the indices to probe for equality +#[derive(Debug)] +pub struct HashProbe<'a> { + buckets: &'a HashBuckets, + idx: usize, + mask: usize, +} + +impl<'a> Iterator for HashProbe<'a> { + type Item = usize; + + fn next(&mut self) -> Option { + let slice = self.buckets.buckets.get(self.idx..self.idx + 4)?; + let entry = u32::from_le_bytes(slice.try_into().unwrap()); + self.idx = (self.idx + 4) & self.mask; + + // Empty entries are encoded as 0 + Some(entry.checked_sub(1)? 
as usize) + } +} + +/// An encoder for [`HashBuckets`] +#[derive(Debug)] +pub struct HashBucketsEncoder { + mask: usize, + buckets: Vec, + hash: SipHasher24, + len: u32, + capacity: u32, +} + +impl HashBucketsEncoder { + /// Create a new [`HashBucketsEncoder`] + /// + /// # Panics + /// + /// Panics if capacity >= u32::MAX + pub fn new(capacity: usize) -> Self { + assert!(capacity < u32::MAX as usize); + + let buckets_len = (capacity * 2).next_power_of_two() * 4; + let mask = buckets_len.wrapping_sub(1) ^ 3; + Self { + mask, + len: 0, + capacity: capacity as u32, + buckets: vec![0; buckets_len], + // Note: this uses keys (0, 0) + hash: SipHasher24::new(), + } + } + + /// Append a new value + /// + /// # Panics + /// + /// Panics if this would exceed the capacity provided to new + pub fn push(&mut self, v: &[u8]) { + self.push_raw(self.hash.hash(v)); + } + + /// Append a new value by hash, returning the bucket index + fn push_raw(&mut self, hash: u64) -> usize { + assert_ne!(self.len, self.capacity); + self.len += 1; + let entry = self.len; + let mut idx = (hash as usize) & self.mask; + loop { + let s = &mut self.buckets[idx..idx + 4]; + let s: &mut [u8; 4] = s.try_into().unwrap(); + if s.iter().all(|x| *x == 0) { + *s = entry.to_le_bytes(); + return idx / 4; + } + idx = (idx + 4) & self.mask; + } + } + + /// Construct the output [`HashBuckets`] + pub fn finish(self) -> HashBuckets { + HashBuckets { + mask: self.mask, + hash: self.hash, + buckets: self.buckets.into(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_collision() { + let mut builder = HashBucketsEncoder::new(6); + + assert_eq!(builder.push_raw(14), 3); + assert_eq!(builder.push_raw(297), 10); + assert_eq!(builder.push_raw(43), 11); // Hashes to occupied bucket 10 + assert_eq!(builder.push_raw(60), 15); + assert_eq!(builder.push_raw(124), 0); // Hashes to occupied bucket 15 + assert_eq!(builder.push_raw(0), 1); // Hashes to occupied bucket 0 + + let buckets = 
builder.finish(); + + let l = buckets.lookup_raw(14).collect::>(); + assert_eq!(l, vec![0]); + + let l = buckets.lookup_raw(297).collect::>(); + assert_eq!(l, vec![1, 2]); + + let l = buckets.lookup_raw(43).collect::>(); + assert_eq!(l, vec![1, 2]); + + let l = buckets.lookup_raw(60).collect::>(); + assert_eq!(l, vec![3, 4, 5]); + + let l = buckets.lookup_raw(0).collect::>(); + assert_eq!(l, vec![4, 5]); + } + + #[test] + fn test_basic() { + let data = ["a", "", "bongos", "cupcakes", "bananas"]; + let mut builder = HashBucketsEncoder::new(data.len()); + for s in &data { + builder.push(s.as_bytes()); + } + let buckets = builder.finish(); + + let contains = |s: &str| -> bool { buckets.lookup(s.as_bytes()).any(|idx| data[idx] == s) }; + + assert!(contains("a")); + assert!(contains("")); + assert!(contains("bongos")); + assert!(contains("bananas")); + assert!(!contains("windows")); + } +} diff --git a/data_types/src/snapshot/list.rs b/data_types/src/snapshot/list.rs new file mode 100644 index 0000000..bd86b98 --- /dev/null +++ b/data_types/src/snapshot/list.rs @@ -0,0 +1,192 @@ +//! 
A list of [`Message`] supporting efficient skipping + +use bytes::Bytes; +use prost::Message; +use snafu::{ensure, Snafu}; +use std::marker::PhantomData; +use std::ops::Range; + +use generated_types::influxdata::iox::catalog_cache::v1 as generated; + +/// Error type for [`MessageList`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(context(false), display("PackedList decode error: {source}"))] + DecodeError { source: prost::DecodeError }, + + #[snafu(context(false), display("PackedList encode error: {source}"))] + EncodeError { source: prost::EncodeError }, + + #[snafu(display("Invalid MessageList offsets: {start}..{end}"))] + InvalidSlice { start: usize, end: usize }, + + #[snafu(display("MessageList slice {start}..{end} out of bounds 0..{bounds}"))] + SliceOutOfBounds { + start: usize, + end: usize, + bounds: usize, + }, +} + +/// Error type for [`MessageList`] +pub type Result = std::result::Result; + +/// A packed list of [`Message`] +/// +/// Normally protobuf encodes repeated fields by simply encoding the tag multiple times, +/// see [here](https://protobuf.dev/programming-guides/encoding/#optional). +/// +/// Unfortunately this means it is not possible to locate a value at a given index without +/// decoding all prior records. 
[`MessageList`] therefore provides a list encoding, inspired +/// by arrow, that provides this and is designed to be combined with [`prost`]'s support +/// for zero-copy decoding of [`Bytes`] +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct MessageList { + len: usize, + offsets: Bytes, + values: Bytes, + phantom: PhantomData, +} + +impl MessageList { + /// Encode `values` to a [`MessageList`] + pub fn encode(values: &[T]) -> Result { + let cap = (values.len() + 1) * 4; + let mut offsets: Vec = Vec::with_capacity(cap); + offsets.extend_from_slice(&0_u32.to_le_bytes()); + + let mut cap = 0; + for x in values { + cap += x.encoded_len(); + let offset = u32::try_from(cap).unwrap(); + offsets.extend_from_slice(&offset.to_le_bytes()); + } + + let mut data = Vec::with_capacity(cap); + values.iter().try_for_each(|x| x.encode(&mut data))?; + + Ok(Self { + len: values.len(), + offsets: offsets.into(), + values: data.into(), + phantom: Default::default(), + }) + } + + /// Returns true if this list is empty + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the number of elements in this list + pub fn len(&self) -> usize { + self.len + } + + /// Returns the element at index `idx` + pub fn get(&self, idx: usize) -> Result { + let offset_start = idx * 4; + let offset_slice = &self.offsets[offset_start..offset_start + 8]; + let start = u32::from_le_bytes(offset_slice[0..4].try_into().unwrap()) as usize; + let end = u32::from_le_bytes(offset_slice[4..8].try_into().unwrap()) as usize; + + let bounds = self.values.len(); + ensure!(end >= start, InvalidSliceSnafu { start, end }); + ensure!(end <= bounds, SliceOutOfBoundsSnafu { start, end, bounds }); + + // We slice `Bytes` to preserve zero-copy + let data = self.values.slice(start..end); + Ok(T::decode(data)?) 
+ } +} + +impl From for MessageList { + fn from(proto: generated::MessageList) -> Self { + let len = (proto.offsets.len() / 4).saturating_sub(1); + Self { + len, + offsets: proto.offsets, + values: proto.values, + phantom: Default::default(), + } + } +} + +impl From> for generated::MessageList { + fn from(value: MessageList) -> Self { + Self { + offsets: value.offsets, + values: value.values, + } + } +} + +impl IntoIterator for MessageList { + type Item = Result; + type IntoIter = MessageListIter; + + fn into_iter(self) -> Self::IntoIter { + MessageListIter { + iter: (0..self.len), + list: self, + } + } +} + +/// [`Iterator`] for [`MessageList`] +#[derive(Debug)] +pub struct MessageListIter { + iter: Range, + list: MessageList, +} + +impl Iterator for MessageListIter { + type Item = Result; + + fn next(&mut self) -> Option { + Some(self.list.get(self.iter.next()?)) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple() { + let strings = ["", "test", "foo", "abc", "", "skd"]; + let strings: Vec<_> = strings.into_iter().map(ToString::to_string).collect(); + + let encoded = MessageList::encode(&strings).unwrap(); + + assert_eq!(encoded.get(5).unwrap().as_str(), "skd"); + assert_eq!(encoded.get(2).unwrap().as_str(), "foo"); + assert_eq!(encoded.get(0).unwrap().as_str(), ""); + + let decoded: Vec<_> = encoded.clone().into_iter().map(Result::unwrap).collect(); + assert_eq!(strings, decoded); + + let proto = generated::MessageList::from(encoded.clone()); + let back = MessageList::::from(proto.clone()); + assert_eq!(encoded, back); + + // Invalid decode should return error not panic + let invalid = MessageList::::from(proto); + invalid.get(2).unwrap_err(); + + let strings: Vec = vec![]; + let encoded = MessageList::encode(&strings).unwrap(); + assert_eq!(encoded.len(), 0); + assert!(encoded.is_empty()); + + let proto = generated::MessageList::default(); + let encoded = 
MessageList::::from(proto); + assert_eq!(encoded.len(), 0); + assert!(encoded.is_empty()); + } +} diff --git a/data_types/src/snapshot/mask.rs b/data_types/src/snapshot/mask.rs new file mode 100644 index 0000000..ae9dc3b --- /dev/null +++ b/data_types/src/snapshot/mask.rs @@ -0,0 +1,71 @@ +//! A packed bitmask + +use arrow_buffer::bit_iterator::BitIndexIterator; +use arrow_buffer::bit_util::{ceil, set_bit}; +use bytes::Bytes; +use generated_types::influxdata::iox::catalog_cache::v1 as generated; + +/// A packed bitmask +#[derive(Debug, Clone)] +pub struct BitMask { + mask: Bytes, + len: usize, +} + +impl BitMask { + /// Returns an iterator of the set indices in this mask + pub fn set_indices(&self) -> BitIndexIterator<'_> { + BitIndexIterator::new(&self.mask, 0, self.len) + } +} + +impl From for BitMask { + fn from(value: generated::BitMask) -> Self { + Self { + mask: value.mask, + len: value.len as _, + } + } +} + +impl From for generated::BitMask { + fn from(value: BitMask) -> Self { + Self { + mask: value.mask, + len: value.len as _, + } + } +} + +/// A builder for [`BitMask`] +#[derive(Debug)] +pub struct BitMaskBuilder { + values: Vec, + len: usize, +} + +impl BitMaskBuilder { + /// Create a new bitmask able to store `len` boolean values + #[inline] + pub fn new(len: usize) -> Self { + Self { + values: vec![0; ceil(len, 8)], + len, + } + } + + /// Set the bit at index `idx` + #[inline] + pub fn set_bit(&mut self, idx: usize) { + set_bit(&mut self.values, idx) + } + + /// Return the built [`BitMask`] + #[inline] + pub fn finish(self) -> BitMask { + BitMask { + mask: self.values.into(), + len: self.len, + } + } +} diff --git a/data_types/src/snapshot/mod.rs b/data_types/src/snapshot/mod.rs new file mode 100644 index 0000000..7be5a93 --- /dev/null +++ b/data_types/src/snapshot/mod.rs @@ -0,0 +1,11 @@ +//! Definitions of catalog snapshots +//! +//! Snapshots are read-optimised, that is they are designed to be inexpensive to +//! 
decode, making extensive use of zero-copy [`Bytes`](bytes::Bytes) in place of +//! allocating structures such as `String` and `Vec` + +pub mod hash; +pub mod list; +pub mod mask; +pub mod partition; +pub mod table; diff --git a/data_types/src/snapshot/partition.rs b/data_types/src/snapshot/partition.rs new file mode 100644 index 0000000..d1838e5 --- /dev/null +++ b/data_types/src/snapshot/partition.rs @@ -0,0 +1,246 @@ +//! Snapshot definition for partitions + +use crate::snapshot::list::MessageList; +use crate::snapshot::mask::{BitMask, BitMaskBuilder}; +use crate::{ + ColumnId, ColumnSet, CompactionLevelProtoError, NamespaceId, ObjectStoreId, ParquetFile, + ParquetFileId, Partition, PartitionHashId, PartitionHashIdError, PartitionId, + SkippedCompaction, SortKeyIds, TableId, Timestamp, +}; +use bytes::Bytes; +use generated_types::influxdata::iox::{ + catalog_cache::v1 as proto, skipped_compaction::v1 as skipped_compaction_proto, +}; +use snafu::{OptionExt, ResultExt, Snafu}; + +/// Error for [`PartitionSnapshot`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Error decoding PartitionFile: {source}"))] + FileDecode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error encoding ParquetFile: {source}"))] + FileEncode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Missing required field {field}"))] + RequiredField { field: &'static str }, + + #[snafu(context(false))] + CompactionLevel { source: CompactionLevelProtoError }, + + #[snafu(context(false))] + PartitionHashId { source: PartitionHashIdError }, + + #[snafu(display("Invalid partition key: {source}"))] + PartitionKey { source: std::str::Utf8Error }, +} + +/// Result for [`PartitionSnapshot`] +pub type Result = std::result::Result; + +/// A snapshot of a partition +#[derive(Debug, Clone)] +pub struct PartitionSnapshot { + /// The [`NamespaceId`] + namespace_id: NamespaceId, + /// The [`TableId`] + table_id: TableId, + /// The 
[`PartitionId`] + partition_id: PartitionId, + /// The [`PartitionHashId`] + partition_hash_id: Option, + /// The generation of this snapshot + generation: u64, + /// The partition key + key: Bytes, + /// The files + files: MessageList, + /// The columns for this partition + columns: ColumnSet, + /// The sort key ids + sort_key: SortKeyIds, + /// The time of a new file + new_file_at: Option, + /// Skipped compaction. + skipped_compaction: Option, +} + +impl PartitionSnapshot { + /// Create a new [`PartitionSnapshot`] from the provided state + pub fn encode( + namespace_id: NamespaceId, + partition: Partition, + files: Vec, + skipped_compaction: Option, + generation: u64, + ) -> Result { + // Iterate in reverse order as schema additions are normally additive and + // so the later files will typically have more columns + let columns = files.iter().rev().fold(ColumnSet::empty(), |mut acc, v| { + acc.union(&v.column_set); + acc + }); + + let files = files + .into_iter() + .map(|file| { + let mut mask = BitMaskBuilder::new(columns.len()); + for (idx, _) in columns.intersect(&file.column_set) { + mask.set_bit(idx); + } + + proto::PartitionFile { + id: file.id.get(), + object_store_uuid: Some(file.object_store_id.get_uuid().into()), + min_time: file.min_time.0, + max_time: file.max_time.0, + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level as _, + created_at: file.created_at.0, + max_l0_created_at: file.max_l0_created_at.0, + column_mask: Some(mask.finish().into()), + } + }) + .collect::>(); + + Ok(Self { + generation, + columns, + namespace_id, + partition_id: partition.id, + partition_hash_id: partition.hash_id().cloned(), + key: partition.partition_key.as_bytes().to_vec().into(), + files: MessageList::encode(&files).context(FileEncodeSnafu)?, + sort_key: partition.sort_key_ids().cloned().unwrap_or_default(), + table_id: partition.table_id, + new_file_at: partition.new_file_at, + skipped_compaction: 
skipped_compaction.map(|sc| sc.into()), + }) + } + + /// Create a new [`PartitionSnapshot`] from a `proto` and generation + pub fn decode(proto: proto::Partition, generation: u64) -> Self { + let table_id = TableId::new(proto.table_id); + let partition_hash_id = proto + .partition_hash_id + .then(|| PartitionHashId::from_raw(table_id, proto.key.as_ref())); + + Self { + generation, + table_id, + partition_hash_id, + key: proto.key, + files: MessageList::from(proto.files.unwrap_or_default()), + namespace_id: NamespaceId::new(proto.namespace_id), + partition_id: PartitionId::new(proto.partition_id), + columns: ColumnSet::new(proto.column_ids.into_iter().map(ColumnId::new)), + sort_key: SortKeyIds::new(proto.sort_key_ids.into_iter().map(ColumnId::new)), + new_file_at: proto.new_file_at.map(Timestamp::new), + skipped_compaction: proto.skipped_compaction, + } + } + + /// Returns the generation of this snapshot + pub fn generation(&self) -> u64 { + self.generation + } + + /// Returns the [`PartitionId`] + pub fn partition_id(&self) -> PartitionId { + self.partition_id + } + + /// Returns the [`PartitionHashId`] if any + pub fn partition_hash_id(&self) -> Option<&PartitionHashId> { + self.partition_hash_id.as_ref() + } + + /// Returns the file at index `idx` + pub fn file(&self, idx: usize) -> Result { + let file = self.files.get(idx).context(FileDecodeSnafu)?; + + let uuid = file.object_store_uuid.context(RequiredFieldSnafu { + field: "object_store_uuid", + })?; + + let column_set = match file.column_mask { + Some(mask) => { + let mask = BitMask::from(mask); + ColumnSet::new(mask.set_indices().map(|idx| self.columns[idx])) + } + None => self.columns.clone(), + }; + + Ok(ParquetFile { + id: ParquetFileId(file.id), + namespace_id: self.namespace_id, + table_id: self.table_id, + partition_id: self.partition_id, + partition_hash_id: self.partition_hash_id.clone(), + object_store_id: ObjectStoreId::from_uuid(uuid.into()), + min_time: Timestamp(file.min_time), + max_time: 
Timestamp(file.max_time), + to_delete: None, + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level.try_into()?, + created_at: Timestamp(file.created_at), + column_set, + max_l0_created_at: Timestamp(file.max_l0_created_at), + }) + } + + /// Returns an iterator over the files in this snapshot + pub fn files(&self) -> impl Iterator> + '_ { + (0..self.files.len()).map(|idx| self.file(idx)) + } + + /// Returns the [`Partition`] for this snapshot + pub fn partition(&self) -> Result { + let key = std::str::from_utf8(&self.key).context(PartitionKeySnafu)?; + Ok(Partition::new_catalog_only( + self.partition_id, + self.partition_hash_id.clone(), + self.table_id, + key.into(), + self.sort_key.clone(), + self.new_file_at, + )) + } + + /// Returns the columns IDs + pub fn column_ids(&self) -> &ColumnSet { + &self.columns + } + + /// Return skipped compaction for this partition, if any. + pub fn skipped_compaction(&self) -> Option { + self.skipped_compaction + .as_ref() + .cloned() + .map(|sc| sc.into()) + } +} + +impl From for proto::Partition { + fn from(value: PartitionSnapshot) -> Self { + Self { + key: value.key, + files: Some(value.files.into()), + namespace_id: value.namespace_id.get(), + table_id: value.table_id.get(), + partition_id: value.partition_id.get(), + partition_hash_id: value.partition_hash_id.is_some(), + column_ids: value.columns.iter().map(|x| x.get()).collect(), + sort_key_ids: value.sort_key.iter().map(|x| x.get()).collect(), + new_file_at: value.new_file_at.map(|x| x.get()), + skipped_compaction: value.skipped_compaction, + } + } +} diff --git a/data_types/src/snapshot/table.rs b/data_types/src/snapshot/table.rs new file mode 100644 index 0000000..08c235d --- /dev/null +++ b/data_types/src/snapshot/table.rs @@ -0,0 +1,197 @@ +//! 
Snapshot definition for tables +use crate::snapshot::list::MessageList; +use crate::{ + Column, ColumnId, ColumnTypeProtoError, NamespaceId, Partition, PartitionId, Table, TableId, +}; +use bytes::Bytes; +use generated_types::influxdata::iox::catalog_cache::v1 as proto; +use generated_types::influxdata::iox::column_type::v1::ColumnType; +use generated_types::influxdata::iox::partition_template::v1::PartitionTemplate; +use snafu::{ResultExt, Snafu}; + +/// Error for [`TableSnapshot`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Error decoding TablePartition: {source}"))] + PartitionDecode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error encoding TablePartition: {source}"))] + PartitionEncode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error decoding TableColumn: {source}"))] + ColumnDecode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error encoding TableColumn: {source}"))] + ColumnEncode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Invalid column name: {source}"))] + ColumnName { source: std::str::Utf8Error }, + + #[snafu(display("Invalid table name: {source}"))] + TableName { source: std::str::Utf8Error }, + + #[snafu(display("Invalid partition template: {source}"))] + PartitionTemplate { + source: crate::partition_template::ValidationError, + }, + + #[snafu(context(false))] + ColumnType { source: ColumnTypeProtoError }, +} + +/// Result for [`TableSnapshot`] +pub type Result = std::result::Result; + +/// A snapshot of a table +#[derive(Debug, Clone)] +pub struct TableSnapshot { + table_id: TableId, + namespace_id: NamespaceId, + table_name: Bytes, + partitions: MessageList, + columns: MessageList, + partition_template: Option, + generation: u64, +} + +impl TableSnapshot { + /// Create a new [`TableSnapshot`] from the provided state + pub fn encode( + table: Table, + partitions: Vec, + columns: Vec, + generation: u64, + ) -> 
Result { + let columns: Vec<_> = columns + .into_iter() + .map(|c| proto::TableColumn { + id: c.id.get(), + name: c.name.into(), + column_type: ColumnType::from(c.column_type).into(), + }) + .collect(); + + let partitions: Vec<_> = partitions + .into_iter() + .map(|p| proto::TablePartition { + id: p.id.get(), + key: p.partition_key.as_bytes().to_vec().into(), + }) + .collect(); + + Ok(Self { + table_id: table.id, + namespace_id: table.namespace_id, + table_name: table.name.into(), + partitions: MessageList::encode(&partitions).context(PartitionEncodeSnafu)?, + columns: MessageList::encode(&columns).context(ColumnEncodeSnafu)?, + partition_template: table.partition_template.as_proto().cloned(), + generation, + }) + } + + /// Create a new [`TableSnapshot`] from a `proto` and generation + pub fn decode(proto: proto::Table, generation: u64) -> Self { + Self { + generation, + table_id: TableId::new(proto.table_id), + namespace_id: NamespaceId::new(proto.namespace_id), + table_name: proto.table_name, + partitions: MessageList::from(proto.partitions.unwrap_or_default()), + columns: MessageList::from(proto.columns.unwrap_or_default()), + partition_template: proto.partition_template, + } + } + + /// Returns the [`Table`] for this snapshot + pub fn table(&self) -> Result
{ + let name = std::str::from_utf8(&self.table_name).context(TableNameSnafu)?; + let template = self + .partition_template + .clone() + .try_into() + .context(PartitionTemplateSnafu)?; + + Ok(Table { + id: self.table_id, + namespace_id: self.namespace_id, + name: name.into(), + partition_template: template, + }) + } + + /// Returns the column by index + pub fn column(&self, idx: usize) -> Result { + let column = self.columns.get(idx).context(ColumnDecodeSnafu)?; + let name = std::str::from_utf8(&column.name).context(ColumnNameSnafu)?; + + Ok(Column { + id: ColumnId::new(column.id), + table_id: self.table_id, + name: name.into(), + column_type: (column.column_type as i16).try_into()?, + }) + } + + /// Returns an iterator of the columns in this table + pub fn columns(&self) -> impl Iterator> + '_ { + (0..self.columns.len()).map(|idx| self.column(idx)) + } + + /// Returns an iterator of the [`PartitionId`] in this table + pub fn partitions(&self) -> impl Iterator> + '_ { + (0..self.partitions.len()).map(|idx| { + let p = self.partitions.get(idx).context(PartitionDecodeSnafu)?; + Ok(TableSnapshotPartition { + id: PartitionId::new(p.id), + key: p.key, + }) + }) + } + + /// Returns the generation of this snapshot + pub fn generation(&self) -> u64 { + self.generation + } +} + +/// Partition information stored within [`TableSnapshot`] +#[derive(Debug)] +pub struct TableSnapshotPartition { + id: PartitionId, + key: Bytes, +} + +impl TableSnapshotPartition { + /// Returns the [`PartitionId`] for this partition + pub fn id(&self) -> PartitionId { + self.id + } + + /// Returns the partition key for this partition + pub fn key(&self) -> &[u8] { + &self.key + } +} + +impl From for proto::Table { + fn from(value: TableSnapshot) -> Self { + Self { + partitions: Some(value.partitions.into()), + columns: Some(value.columns.into()), + partition_template: value.partition_template, + namespace_id: value.namespace_id.get(), + table_id: value.table_id.get(), + table_name: 
value.table_name, + } + } +} diff --git a/datafusion_util/Cargo.toml b/datafusion_util/Cargo.toml new file mode 100644 index 0000000..1f5f554 --- /dev/null +++ b/datafusion_util/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "datafusion_util" +description = "Datafusion utilities" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +async-trait = "0.1" +datafusion = { workspace = true } +futures = "0.3" +object_store = { workspace = true } +observability_deps = { path = "../observability_deps" } +pin-project = "1.1" +schema = { path = "../schema" } +tokio = { version = "1.35", features = ["parking_lot", "sync"] } +tokio-stream = "0.1" +url = "2.5" +workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/datafusion_util/src/config.rs b/datafusion_util/src/config.rs new file mode 100644 index 0000000..ed41b19 --- /dev/null +++ b/datafusion_util/src/config.rs @@ -0,0 +1,50 @@ +use std::{fmt::Display, sync::Arc}; + +use datafusion::{ + config::ConfigOptions, execution::runtime_env::RuntimeEnv, prelude::SessionConfig, +}; +use object_store::ObjectStore; +use schema::TIME_DATA_TIMEZONE; +use url::Url; + +// The default catalog name - this impacts what SQL queries use if not specified +pub const DEFAULT_CATALOG: &str = "public"; +// The default schema name - this impacts what SQL queries use if not specified +pub const DEFAULT_SCHEMA: &str = "iox"; + +/// The maximum number of rows that DataFusion should create in each RecordBatch +pub const BATCH_SIZE: usize = 8 * 1024; + +/// Return a SessionConfig object configured for IOx +pub fn iox_session_config() -> SessionConfig { + // Enable parquet predicate pushdown optimization + let mut options = ConfigOptions::new(); + options.execution.parquet.pushdown_filters = true; + options.execution.parquet.reorder_filters = true; + options.execution.time_zone = TIME_DATA_TIMEZONE().map(|s| s.to_string()); + 
options.optimizer.repartition_sorts = true; + + SessionConfig::from(options) + .with_batch_size(BATCH_SIZE) + .with_create_default_catalog_and_schema(true) + .with_information_schema(true) + .with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA) + // Tell the datafusion optimizer to avoid repartitioning sorted inputs + .with_prefer_existing_sort(true) + // Avoid repartitioning file scans as it destroys existing sort orders + // see https://github.com/influxdata/influxdb_iox/issues/9450 + // see https://github.com/apache/arrow-datafusion/issues/8451 + .with_repartition_file_scans(false) +} + +/// Register the "IOx" object store provider for URLs of the form "iox://{id} +/// +/// Return the previous registered store, if any +pub fn register_iox_object_store( + runtime: impl AsRef, + id: D, + object_store: Arc, +) -> Option> { + let url = Url::parse(&format!("iox://{id}")).unwrap(); + runtime.as_ref().register_object_store(&url, object_store) +} diff --git a/datafusion_util/src/lib.rs b/datafusion_util/src/lib.rs new file mode 100644 index 0000000..6323f06 --- /dev/null +++ b/datafusion_util/src/lib.rs @@ -0,0 +1,519 @@ +#![deny( + clippy::future_not_send, + clippy::todo, + clippy::dbg_macro, + clippy::clone_on_ref_ptr, + rustdoc::broken_intra_doc_links, + rustdoc::bare_urls, + rust_2018_idioms, + unused_crate_dependencies +)] +#![allow(clippy::clone_on_ref_ptr)] + +//! This module contains various DataFusion utility functions. +//! +//! Almost everything for manipulating DataFusion `Expr`s IOx should be in DataFusion already +//! (or if not it should be upstreamed). +//! +//! For example, check out +//! [datafusion_optimizer::utils](https://docs.rs/datafusion-optimizer/13.0.0/datafusion_optimizer/utils/index.html) +//! for expression manipulation functions. + +use datafusion::execution::memory_pool::{MemoryPool, UnboundedMemoryPool}; +use std::collections::HashSet; +// Workaround for "unused crate" lint false positives. 
+use workspace_hack as _; + +pub mod config; +pub mod sender; +pub mod watch; + +use std::sync::Arc; +use std::task::{Context, Poll}; + +use datafusion::arrow::array::BooleanArray; +use datafusion::arrow::compute::filter_record_batch; +use datafusion::arrow::datatypes::{DataType, Fields}; +use datafusion::common::stats::Precision; +use datafusion::common::{DataFusionError, ToDFSchema}; +use datafusion::execution::context::TaskContext; +use datafusion::logical_expr::expr::Sort; +use datafusion::logical_expr::utils::inspect_expr_pre; +use datafusion::physical_expr::execution_props::ExecutionProps; +use datafusion::physical_expr::{create_physical_expr, PhysicalExpr}; +use datafusion::physical_optimizer::pruning::PruningPredicate; +use datafusion::physical_plan::{collect, EmptyRecordBatchStream, ExecutionPlan}; +use datafusion::prelude::{lit, Column, Expr, SessionContext}; +use datafusion::{ + arrow::{ + datatypes::{Schema, SchemaRef}, + record_batch::RecordBatch, + }, + physical_plan::{RecordBatchStream, SendableRecordBatchStream}, + scalar::ScalarValue, +}; +use futures::{Stream, StreamExt}; +use schema::TIME_DATA_TIMEZONE; +use tokio::sync::mpsc::{Receiver, UnboundedReceiver}; +use tokio_stream::wrappers::{ReceiverStream, UnboundedReceiverStream}; +use watch::WatchedTask; + +/// Traits to help creating DataFusion [`Expr`]s +pub trait AsExpr { + /// Creates a DataFusion expr + fn as_expr(&self) -> Expr; + + /// creates a DataFusion SortExpr + fn as_sort_expr(&self) -> Expr { + Expr::Sort(Sort { + expr: Box::new(self.as_expr()), + asc: true, // Sort ASCENDING + nulls_first: true, + }) + } +} + +impl AsExpr for Arc { + fn as_expr(&self) -> Expr { + self.as_ref().as_expr() + } +} + +impl AsExpr for str { + fn as_expr(&self) -> Expr { + // note using `col()` will parse identifiers and try to + // split them on `.`. 
+ // + // So it would treat 'foo.bar' as table 'foo', column 'bar' + // + // This is not correct for influxrpc, so instead treat it + // like the column "foo.bar" + Expr::Column(Column { + relation: None, + name: self.into(), + }) + } +} + +impl AsExpr for Expr { + fn as_expr(&self) -> Expr { + self.clone() + } +} + +/// Creates an `Expr` that represents a Dictionary encoded string (e.g +/// the type of constant that a tag would be compared to) +pub fn lit_dict(value: &str) -> Expr { + // expr has been type coerced + lit(ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::new_utf8(value)), + )) +} + +/// Creates expression like: +/// start <= time && time < end +pub fn make_range_expr(start: i64, end: i64, time: impl AsRef) -> Expr { + // We need to cast the start and end values to timestamps + // the equivalent of: + let ts_start = timestamptz_nano(start); + let ts_end = timestamptz_nano(end); + + let time_col = time.as_ref().as_expr(); + let ts_low = lit(ts_start).lt_eq(time_col.clone()); + let ts_high = time_col.lt(lit(ts_end)); + + ts_low.and(ts_high) +} + +/// Ensures all columns referred to in `filters` are in the `projection`, if +/// any, adding them if necessary. +pub fn extend_projection_for_filters( + schema: &Schema, + filters: &[Expr], + projection: Option<&Vec>, +) -> Result>, DataFusionError> { + let Some(mut projection) = projection.cloned() else { + return Ok(None); + }; + + let mut seen_cols: HashSet = projection.iter().cloned().collect(); + for filter in filters { + inspect_expr_pre(filter, |expr| { + if let Expr::Column(c) = expr { + let idx = schema.index_of(&c.name)?; + // if haven't seen this column before, add it to the list + if seen_cols.insert(idx) { + projection.push(idx); + } + } + Ok(()) as Result<(), DataFusionError> + })?; + } + Ok(Some(projection)) +} + +// TODO port this upstream to datafusion (maybe as From
{ + let tables = repos.tables(); + + // Note the export format doesn't currently have any table level information + let table_name = iox_metadata.table_name.as_ref(); + + if let Some(table) = tables + .get_by_namespace_and_name(namespace.id, table_name) + .await? + { + return Ok(table); + } + + // use exported table + if let Some(table) = self.exported_contents.table(namespace.id.get(), table_name) { + return Ok(tables + .create( + &table.name, + table.partition_template.try_into()?, + NamespaceId::new(table.namespace_id), + ) + .await?); + } + + // need to make a new table, create the default partitioning scheme... + let partition_template = PARTITION_BY_DAY_PROTO.as_ref().clone(); + let namespace_template = NamespacePartitionTemplateOverride::try_from(partition_template)?; + let custom_table_template = None; + let partition_template = + TablePartitionTemplateOverride::try_new(custom_table_template, &namespace_template)?; + let table = tables + .create(table_name, partition_template, namespace.id) + .await?; + Ok(table) + } + + /// Create the catalog [`Partition`] into which the specified parquet + /// file shoudl be inserted. + /// + /// The sort_key and sort_key_ids of the partition should be empty when it is first created + /// because there are no columns in any parquet files to use for sorting yet. + /// The sort_key and sort_key_ids will be updated after the parquet files are created. + async fn create_partition( + &self, + repos: &mut dyn RepoCollection, + table: &Table, + partition_key: PartitionKey, + ) -> Result { + let partition = repos + .partitions() + .create_or_get(partition_key, table.id) + .await?; + + Ok(partition) + } + + /// Update sort keys of the partition + /// + /// file should be inserted. 
+ /// + /// First attempts to use any available metadata from the + /// catalog export, and falls back to what is in the iox + /// metadata stored in the parquet file, if needed + async fn update_partition( + &self, + partition: &mut Partition, + repos: &mut dyn RepoCollection, + table: &Table, + iox_metadata: &IoxMetadata, + ) -> Result { + let partition_key = iox_metadata.partition_key.clone(); + + // Note we use the table_id embedded in the file's metadata + // from the source catalog to match the exported catalog (which + // is different than the new table we just created in the + // target catalog); + let proto_partition = self + .exported_contents + .partition_metadata(iox_metadata.table_id.get(), partition_key.inner()); + + let new_sort_key_ids = if let Some(proto_partition) = proto_partition.as_ref() { + // Use the sort key from the source catalog + debug!(sort_key_ids=?proto_partition.sort_key_ids, "Using sort key from catalog export"); + let new_sort_key_ids = match &proto_partition.sort_key_ids { + Some(sort_key_ids) => sort_key_ids.array_sort_key_ids.clone(), + None => vec![], + }; + + SortKeyIds::from(new_sort_key_ids) + } else { + warn!("Could not find sort key in catalog metadata export, falling back to embedded metadata"); + let sort_key = iox_metadata + .sort_key + .as_ref() + .ok_or_else(|| Error::NoSortKey)?; + + let new_sort_key = sort_key.to_columns().collect::>(); + + // fetch table columns + let columns = get_table_columns_by_id(table.id, repos).await?; + columns.ids_for_names(&new_sort_key) + }; + + loop { + let res = repos + .partitions() + .cas_sort_key(partition.id, partition.sort_key_ids(), &new_sort_key_ids) + .await; + + match res { + Ok(partition) => return Ok(partition), + Err(CasFailure::ValueMismatch(_)) => { + debug!("Value mismatch when setting sort key, retrying..."); + continue; + } + Err(CasFailure::QueryError(e)) => return Err(Error::SetSortKey(e)), + } + } + } + + /// Return a [`ParquetFileParams`] (information needed to
insert + /// the data into the target catalog). + /// + /// First attempts to use any available metadata from the + /// catalog export, and falls back to what is in the iox + /// metadata stored in the parquet file, if needed + #[allow(clippy::too_many_arguments)] + async fn parquet_file_params( + &self, + repos: &mut dyn RepoCollection, + namespace: &Namespace, + table: &Table, + partition: &Partition, + // parquet metadata, if known + parquet_metadata: Option, + iox_metadata: &IoxMetadata, + decoded_iox_parquet_metadata: &DecodedIoxParquetMetaData, + file_size_bytes: usize, + ) -> Result { + let object_store_id = iox_metadata.object_store_id; + + // need to make columns in the target catalog + let column_set = insert_columns(table.id, decoded_iox_parquet_metadata, repos).await?; + + let params = if let Some(proto_parquet_file) = &parquet_metadata { + let compaction_level = proto_parquet_file.compaction_level.try_into()?; + + ParquetFileParams { + namespace_id: namespace.id, + table_id: table.id, + partition_id: partition.id, + partition_hash_id: partition.hash_id().cloned(), + object_store_id, + min_time: Timestamp::new(proto_parquet_file.min_time), + max_time: Timestamp::new(proto_parquet_file.max_time), + file_size_bytes: proto_parquet_file.file_size_bytes, + row_count: proto_parquet_file.row_count, + compaction_level, + created_at: Timestamp::new(proto_parquet_file.created_at), + column_set, + max_l0_created_at: Timestamp::new(proto_parquet_file.max_l0_created_at), + } + } else { + warn!("Could not read parquet file metadata, reconstructing based on encoded metadata"); + + let (min_time, max_time) = get_min_max_times(decoded_iox_parquet_metadata)?; + let created_at = Timestamp::new(iox_metadata.creation_timestamp.timestamp_nanos()); + ParquetFileParams { + namespace_id: namespace.id, + table_id: table.id, + partition_id: partition.id, + partition_hash_id: partition.hash_id().cloned(), + object_store_id, + min_time, + max_time, + // use unwrap: if we can't fit 
the file size or row + // counts into usize, something is very wrong and we + // should stop immediately (and get an exact stack trace) + file_size_bytes: file_size_bytes.try_into().unwrap(), + row_count: decoded_iox_parquet_metadata.row_count().try_into().unwrap(), + //compaction_level: CompactionLevel::Final, + compaction_level: CompactionLevel::Initial, + created_at, + column_set, + max_l0_created_at: created_at, + } + }; + debug!(?params, "Created ParquetFileParams"); + Ok(params) + } +} +/// Returns a `ColumnSet` that represents all the columns specified in +/// `decoded_iox_parquet_metadata`. +/// +/// Insert the appropriate column entries in the catalog they are not +/// already present. +async fn insert_columns( + table_id: TableId, + decoded_iox_parquet_metadata: &DecodedIoxParquetMetaData, + repos: &mut dyn RepoCollection, +) -> Result { + let schema = decoded_iox_parquet_metadata.read_schema()?; + + let mut column_ids = vec![]; + + for (iox_column_type, field) in schema.iter() { + let column_name = field.name(); + let column_type = ColumnType::from(iox_column_type); + + let column = repos + .columns() + .create_or_get(column_name, table_id, column_type) + .await?; + column_ids.push(column.id); + } + + Ok(ColumnSet::new(column_ids)) +} + +/// Reads out the min and max value for the decoded_iox_parquet_metadata column +fn get_min_max_times( + decoded_iox_parquet_metadata: &DecodedIoxParquetMetaData, +) -> Result<(Timestamp, Timestamp)> { + let schema = decoded_iox_parquet_metadata.read_schema()?; + let stats = decoded_iox_parquet_metadata.read_statistics(&schema)?; + + let Some(summary) = stats.iter().find(|s| s.name == schema::TIME_COLUMN_NAME) else { + return Err(Error::BadStats { stats: None }); + }; + + let Statistics::I64(stats) = &summary.stats else { + return Err(Error::BadStats { + stats: Some(summary.stats.clone()), + }); + }; + + let (Some(min), Some(max)) = (stats.min, stats.max) else { + return Err(Error::NoMinMax { + min: stats.min, + max: 
stats.max, + }); + }; + + Ok((Timestamp::new(min), Timestamp::new(max))) + } + + /// Given a filename of the stored parquet metadata, returns the object_store_id + /// + /// For example, `e65790df-3e42-0094-048f-0b69a7ee402c.parquet`, + /// returns `e65790df-3e42-0094-048f-0b69a7ee402c` + /// + /// For some reason the object store id embedded in the parquet file's + /// [`IoxMetadata`] and that of the actual file in object storage are + /// different, so we need to use the object_store_id actually used in + /// the source system, which is embedded in the filename + fn object_store_id_from_parquet_filename(path: &Path) -> Option { + let stem = path + // .partition_id.parquet --> .partition_id + .file_stem()? + .to_string_lossy(); + + Some(stem.to_string()) + } diff --git a/import_export/src/file/mod.rs b/import_export/src/file/mod.rs new file mode 100644 index 0000000..d0c9b1d --- /dev/null +++ b/import_export/src/file/mod.rs @@ -0,0 +1,6 @@ +/// Code to import/export files +mod export; +mod import; + +pub use export::{ExportError, RemoteExporter}; +pub use import::{Error, ExportedContents, RemoteImporter}; diff --git a/import_export/src/lib.rs b/import_export/src/lib.rs new file mode 100644 index 0000000..df7c5ef --- /dev/null +++ b/import_export/src/lib.rs @@ -0,0 +1,17 @@ +//! Import/export utilities for IOx + +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_debug_implementations, + clippy::explicit_iter_loop, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::future_not_send, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives.
+use workspace_hack as _; + +/// Import/Export data to files +pub mod file; diff --git a/influxdb2_client/Cargo.toml b/influxdb2_client/Cargo.toml new file mode 100644 index 0000000..3b05518 --- /dev/null +++ b/influxdb2_client/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "influxdb2_client" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] # In alphabetical order +bytes = "1.5" +futures = { version = "0.3", default-features = false } +reqwest = { version = "0.11", default-features = false, features = ["stream", "json", "rustls-tls-native-roots"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0.111" +snafu = "0.8" +url = "2.5.0" +uuid = { version = "1", features = ["v4"] } + +[dev-dependencies] # In alphabetical order +mockito = { version ="1.2", default-features = false } +once_cell = { version = "1.19", features = ["parking_lot"] } +parking_lot = "0.12" +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +test_helpers = { path = "../test_helpers" } diff --git a/influxdb2_client/README.md b/influxdb2_client/README.md new file mode 100644 index 0000000..7e94127 --- /dev/null +++ b/influxdb2_client/README.md @@ -0,0 +1,23 @@ +# InfluxDB V2 Client API + +This crate contains a work-in-progress implementation of a Rust client for the [InfluxDB 2.0 API](https://docs.influxdata.com/influxdb/v2.0/reference/api/). + +This client is not the Rust client for IOx. You can find that [here](../influxdb_iox_client). + +The InfluxDB IOx project plans to focus its efforts on the subset of the API which are most relevant to IOx, but we accept (welcome!) PRs for adding the other pieces of functionality. 
+ + ## Design Notes + + When it makes sense, this client aims to mirror the [InfluxDB 2.x Go client API](https://github.com/influxdata/influxdb-client-go) + + ## Contributing + + If you would like to contribute code you can do so through GitHub by forking the repository and sending a pull request into the main branch. + + + ## Future work + + - [ ] Publish as a crate on [crates.io](http://crates.io) + + If you would like to contribute code you can do so through GitHub by forking the repository and sending a pull request into the main branch. diff --git a/influxdb2_client/examples/health.rs b/influxdb2_client/examples/health.rs new file mode 100644 index 0000000..72d0843 --- /dev/null +++ b/influxdb2_client/examples/health.rs @@ -0,0 +1,11 @@ +#[tokio::main] +async fn main() -> Result<(), Box> { + let influx_url = "some-url"; + let token = "some-token"; + + let client = influxdb2_client::Client::new(influx_url, token); + + println!("{:?}", client.health().await?); + + Ok(()) +} diff --git a/influxdb2_client/examples/label.rs b/influxdb2_client/examples/label.rs new file mode 100644 index 0000000..f7a7134 --- /dev/null +++ b/influxdb2_client/examples/label.rs @@ -0,0 +1,29 @@ +use std::collections::HashMap; + +#[tokio::main] +async fn main() -> Result<(), Box> { + let influx_url = "http://localhost:8888"; + let token = "some-token"; + + let client = influxdb2_client::Client::new(influx_url, token); + + println!("{:?}", client.labels().await?); + println!("{:?}", client.labels_by_org("some-org_id").await?); + println!("{:?}", client.find_label("some-label_id").await?); + let mut properties = HashMap::new(); + properties.insert("some-key".to_string(), "some-value".to_string()); + println!( + "{:?}", + client + .create_label("some-org_id", "some-name", Some(properties)) + .await? + ); + println!( + "{:?}", + client + .update_label(Some("some-name".to_string()), None, "some-label_id") + .await?
+ ); + println!("{:?}", client.delete_label("some-label_id").await?); + Ok(()) +} diff --git a/influxdb2_client/examples/query.rs b/influxdb2_client/examples/query.rs new file mode 100644 index 0000000..00bb761 --- /dev/null +++ b/influxdb2_client/examples/query.rs @@ -0,0 +1,26 @@ +use influxdb2_client::models::{LanguageRequest, Query}; + +#[tokio::main] +async fn main() -> Result<(), Box> { + let influx_url = "http://localhost:8086"; + let token = "some-token"; + + let client = influxdb2_client::Client::new(influx_url, token); + + client.query_suggestions().await?; + client.query_suggestions_name("some-name").await?; + + client + .query_raw("some-org", Some(Query::new("some-query".to_string()))) + .await?; + + client + .query_analyze(Some(Query::new("some-query".to_string()))) + .await?; + + client + .query_ast(Some(LanguageRequest::new("some-query".to_string()))) + .await?; + + Ok(()) +} diff --git a/influxdb2_client/examples/ready.rs b/influxdb2_client/examples/ready.rs new file mode 100644 index 0000000..07d69c4 --- /dev/null +++ b/influxdb2_client/examples/ready.rs @@ -0,0 +1,11 @@ +#[tokio::main] +async fn main() -> Result<(), Box> { + let influx_url = "some-url"; + let token = "some-token"; + + let client = influxdb2_client::Client::new(influx_url, token); + + println!("{:?}", client.ready().await?); + + Ok(()) +} diff --git a/influxdb2_client/examples/setup.rs b/influxdb2_client/examples/setup.rs new file mode 100644 index 0000000..c54b12f --- /dev/null +++ b/influxdb2_client/examples/setup.rs @@ -0,0 +1,32 @@ +#[tokio::main] +async fn main() -> Result<(), Box> { + let influx_url = "http://localhost:8888"; + let token = "some-token"; + + let client = influxdb2_client::Client::new(influx_url, token); + + if client.is_onboarding_allowed().await? { + println!( + "{:?}", + client + .onboarding("some-user", "some-org", "some-bucket", None, None, None,) + .await? 
+ ); + } + + println!( + "{:?}", + client + .post_setup_user( + "some-new-user", + "some-new-org", + "some-new-bucket", + None, + None, + None, + ) + .await? + ); + + Ok(()) +} diff --git a/influxdb2_client/examples/write.rs b/influxdb2_client/examples/write.rs new file mode 100644 index 0000000..2bbc23d --- /dev/null +++ b/influxdb2_client/examples/write.rs @@ -0,0 +1,27 @@ +use futures::prelude::*; + +#[tokio::main] +async fn main() -> Result<(), Box> { + let org = "myorg"; + let bucket = "mybucket"; + let influx_url = "http://localhost:9999"; + let token = "my-token"; + + let client = influxdb2_client::Client::new(influx_url, token); + + let points = vec![ + influxdb2_client::models::DataPoint::builder("cpu_load_short") + .tag("host", "server01") + .tag("region", "us-west") + .field("value", 0.64) + .build()?, + influxdb2_client::models::DataPoint::builder("cpu_load_short") + .tag("host", "server01") + .field("value", 27.99) + .build()?, + ]; + + client.write(org, bucket, stream::iter(points)).await?; + + Ok(()) +} diff --git a/influxdb2_client/src/api/buckets.rs b/influxdb2_client/src/api/buckets.rs new file mode 100644 index 0000000..92065d0 --- /dev/null +++ b/influxdb2_client/src/api/buckets.rs @@ -0,0 +1,68 @@ +//! Buckets API + +use crate::models::PostBucketRequest; +use crate::{Client, HttpSnafu, RequestError, ReqwestProcessingSnafu, SerializingSnafu}; +use reqwest::Method; +use snafu::ResultExt; + +impl Client { + /// Create a new bucket in the organization specified by the 16-digit + /// hexadecimal `org_id` and with the bucket name `bucket`. 
+ pub async fn create_bucket( + &self, + post_bucket_request: Option, + ) -> Result<(), RequestError> { + let create_bucket_url = format!("{}/api/v2/buckets", self.url); + + let response = self + .request(Method::POST, &create_bucket_url) + .header("Content-Type", "application/json") + .body( + serde_json::to_string(&post_bucket_request.unwrap_or_default()) + .context(SerializingSnafu)?, + ) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + if !response.status().is_success() { + let status = response.status(); + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mockito::Server; + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn create_bucket() { + let org_id = "0000111100001111".to_string(); + let bucket = "some-bucket".to_string(); + let token = "some-token"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", "/api/v2/buckets") + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body( + format!(r#"{{"orgID":"{org_id}","name":"{bucket}","retentionRules":[]}}"#).as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client + .create_bucket(Some(PostBucketRequest::new(org_id, bucket))) + .await; + + mock.assert_async().await; + } +} diff --git a/influxdb2_client/src/api/health.rs b/influxdb2_client/src/api/health.rs new file mode 100644 index 0000000..4364ec6 --- /dev/null +++ b/influxdb2_client/src/api/health.rs @@ -0,0 +1,53 @@ +//! Health +//! +//! 
Get health of an InfluxDB instance + +use crate::models::HealthCheck; +use crate::{Client, HttpSnafu, RequestError, ReqwestProcessingSnafu}; +use reqwest::{Method, StatusCode}; +use snafu::ResultExt; + +impl Client { + /// Get health of an instance + pub async fn health(&self) -> Result { + let health_url = format!("{}/health", self.url); + let response = self + .request(Method::GET, &health_url) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::OK => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + StatusCode::SERVICE_UNAVAILABLE => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mockito::Server; + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn health() { + let mut mock_server = Server::new_async().await; + let mock = mock_server.mock("GET", "/health").create_async().await; + + let client = Client::new(mock_server.url(), ""); + + let _result = client.health().await; + + mock.assert_async().await; + } +} diff --git a/influxdb2_client/src/api/label.rs b/influxdb2_client/src/api/label.rs new file mode 100644 index 0000000..f71dc56 --- /dev/null +++ b/influxdb2_client/src/api/label.rs @@ -0,0 +1,318 @@ +//! 
Labels + +use crate::models::{LabelCreateRequest, LabelResponse, LabelUpdate, LabelsResponse}; +use crate::{Client, HttpSnafu, RequestError, ReqwestProcessingSnafu, SerializingSnafu}; +use reqwest::{Method, StatusCode}; +use snafu::ResultExt; +use std::collections::HashMap; + +impl Client { + /// List all Labels + pub async fn labels(&self) -> Result { + self.get_labels(None).await + } + + /// List all Labels by organization ID + pub async fn labels_by_org(&self, org_id: &str) -> Result { + self.get_labels(Some(org_id)).await + } + + async fn get_labels(&self, org_id: Option<&str>) -> Result { + let labels_url = format!("{}/api/v2/labels", self.url); + let mut request = self.request(Method::GET, &labels_url); + + if let Some(id) = org_id { + request = request.query(&[("orgID", id)]); + } + + let response = request.send().await.context(ReqwestProcessingSnafu)?; + match response.status() { + StatusCode::OK => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } + + /// Retrieve a label by ID + pub async fn find_label(&self, label_id: &str) -> Result { + let labels_by_id_url = format!("{}/api/v2/labels/{}", self.url, label_id); + let response = self + .request(Method::GET, &labels_by_id_url) + .send() + .await + .context(ReqwestProcessingSnafu)?; + match response.status() { + StatusCode::OK => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? 
+ } + } + } + + /// Create a Label + pub async fn create_label( + &self, + org_id: &str, + name: &str, + properties: Option>, + ) -> Result { + let create_label_url = format!("{}/api/v2/labels", self.url); + let body = LabelCreateRequest { + org_id: org_id.into(), + name: name.into(), + properties, + }; + let response = self + .request(Method::POST, &create_label_url) + .header("Content-Type", "application/json") + .body(serde_json::to_string(&body).context(SerializingSnafu)?) + .send() + .await + .context(ReqwestProcessingSnafu)?; + match response.status() { + StatusCode::CREATED => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } + + /// Update a Label + pub async fn update_label( + &self, + name: Option, + properties: Option>, + label_id: &str, + ) -> Result { + let update_label_url = format!("{}/api/v2/labels/{}", &self.url, label_id); + let body = LabelUpdate { name, properties }; + let response = self + .request(Method::PATCH, &update_label_url) + .header("Content-Type", "application/json") + .body(serde_json::to_string(&body).context(SerializingSnafu)?) + .send() + .await + .context(ReqwestProcessingSnafu)?; + match response.status() { + StatusCode::OK => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? 
+ } + } + } + + /// Delete a Label + pub async fn delete_label(&self, label_id: &str) -> Result<(), RequestError> { + let delete_label_url = format!("{}/api/v2/labels/{}", &self.url, label_id); + let response = self + .request(Method::DELETE, &delete_label_url) + .send() + .await + .context(ReqwestProcessingSnafu)?; + match response.status() { + StatusCode::NO_CONTENT => Ok(()), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mockito::Server; + + const BASE_PATH: &str = "/api/v2/labels"; + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn labels() { + let token = "some-token"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("GET", BASE_PATH) + .match_header("Authorization", format!("Token {token}").as_str()) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.labels().await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn labels_by_org() { + let token = "some-token"; + let org_id = "some-org_id"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("GET", format!("{BASE_PATH}?orgID={org_id}").as_str()) + .match_header("Authorization", format!("Token {token}").as_str()) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.labels_by_org(org_id).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn find_label() { + let token = "some-token"; + let label_id = "some-id"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("GET", format!("{BASE_PATH}/{label_id}").as_str()) + .match_header("Authorization", format!("Token {token}").as_str()) + .create_async() + .await; + + let 
client = Client::new(mock_server.url(), token); + + let _result = client.find_label(label_id).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn create_label() { + let token = "some-token"; + let org_id = "some-org"; + let name = "some-user"; + let mut properties = HashMap::new(); + properties.insert("some-key".to_string(), "some-value".to_string()); + + let mut mock_server = Server::new_async().await; + let mock = mock_server.mock("POST", BASE_PATH) + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body( + format!( + r#"{{"orgID":"{org_id}","name":"{name}","properties":{{"some-key":"some-value"}}}}"# + ) + .as_str(), + ) + .create_async().await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.create_label(org_id, name, Some(properties)).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn create_label_opt() { + let token = "some-token"; + let org_id = "some-org_id"; + let name = "some-user"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", BASE_PATH) + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body(format!(r#"{{"orgID":"{org_id}","name":"{name}"}}"#).as_str()) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.create_label(org_id, name, None).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn update_label() { + let token = "some-token"; + let name = "some-user"; + let label_id = "some-label_id"; + let mut properties = HashMap::new(); + properties.insert("some-key".to_string(), "some-value".to_string()); + + let mut mock_server = Server::new_async().await; + let mock = 
mock_server + .mock("PATCH", format!("{BASE_PATH}/{label_id}").as_str()) + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body( + format!(r#"{{"name":"{name}","properties":{{"some-key":"some-value"}}}}"#).as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client + .update_label(Some(name.to_string()), Some(properties), label_id) + .await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn update_label_opt() { + let token = "some-token"; + let label_id = "some-label_id"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("PATCH", format!("{BASE_PATH}/{label_id}").as_str()) + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body("{}") + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.update_label(None, None, label_id).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn delete_label() { + let token = "some-token"; + let label_id = "some-label_id"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("DELETE", format!("{BASE_PATH}/{label_id}").as_str()) + .match_header("Authorization", format!("Token {token}").as_str()) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.delete_label(label_id).await; + + mock.assert_async().await; + } +} diff --git a/influxdb2_client/src/api/mod.rs b/influxdb2_client/src/api/mod.rs new file mode 100644 index 0000000..ece3d9e --- /dev/null +++ b/influxdb2_client/src/api/mod.rs @@ -0,0 +1,8 @@ +//! 
InfluxDB v2.0 Client API +pub mod buckets; +pub mod health; +pub mod label; +pub mod query; +pub mod ready; +pub mod setup; +pub mod write; diff --git a/influxdb2_client/src/api/query.rs b/influxdb2_client/src/api/query.rs new file mode 100644 index 0000000..85e8ee6 --- /dev/null +++ b/influxdb2_client/src/api/query.rs @@ -0,0 +1,384 @@ +//! Query +//! +//! Query InfluxDB using InfluxQL or Flux Query + +use crate::{ + Client, HttpSnafu, RequestError, ReqwestProcessingSnafu, ResponseBytesSnafu, + ResponseStringSnafu, SerializingSnafu, +}; +use reqwest::{Method, StatusCode}; +use snafu::ResultExt; + +use crate::models::{ + AnalyzeQueryResponse, AstResponse, FluxSuggestion, FluxSuggestions, LanguageRequest, Query, +}; + +impl Client { + /// Get Query Suggestions + pub async fn query_suggestions(&self) -> Result { + let req_url = format!("{}/api/v2/query/suggestions", self.url); + let response = self + .request(Method::GET, &req_url) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::OK => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } + + /// Query Suggestions with name + pub async fn query_suggestions_name(&self, name: &str) -> Result { + let req_url = format!( + "{}/api/v2/query/suggestions/{name}", + self.url, + name = crate::common::urlencode(name), + ); + + let response = self + .request(Method::GET, &req_url) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::OK => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? 
+ } + } + } + + /// Query and return the raw string data from the server + pub async fn query_raw(&self, org: &str, query: Option) -> Result { + let req_url = format!("{}/api/v2/query", self.url); + + let response = self + .request(Method::POST, &req_url) + .header("Accepting-Encoding", "identity") + .header("Content-Type", "application/json") + .query(&[("org", &org)]) + .body(serde_json::to_string(&query.unwrap_or_default()).context(SerializingSnafu)?) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::OK => { + let bytes = response.bytes().await.context(ResponseBytesSnafu)?; + String::from_utf8(bytes.to_vec()).context(ResponseStringSnafu) + } + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } + + /// Analyze Query + pub async fn query_analyze( + &self, + query: Option, + ) -> Result { + let req_url = format!("{}/api/v2/query/analyze", self.url); + + let response = self + .request(Method::POST, &req_url) + .header("Content-Type", "application/json") + .body(serde_json::to_string(&query.unwrap_or_default()).context(SerializingSnafu)?) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::OK => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? 
+ } + } + } + + /// Get Query AST Repsonse + pub async fn query_ast( + &self, + language_request: Option, + ) -> Result { + let req_url = format!("{}/api/v2/query/ast", self.url); + + let response = self + .request(Method::POST, &req_url) + .header("Content-Type", "application/json") + .body( + serde_json::to_string(&language_request.unwrap_or_default()) + .context(SerializingSnafu)?, + ) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::OK => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mockito::{Matcher, Server}; + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn query_suggestions() { + let token = "some-token"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("GET", "/api/v2/query/suggestions") + .match_header("Authorization", format!("Token {token}").as_str()) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.query_suggestions().await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn query_suggestions_name() { + let token = "some-token"; + let suggestion_name = "some-name"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock( + "GET", + format!( + "/api/v2/query/suggestions/{name}", + name = crate::common::urlencode(suggestion_name) + ) + .as_str(), + ) + .match_header("Authorization", format!("Token {token}").as_str()) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.query_suggestions_name(suggestion_name).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn query_raw() 
{ + let token = "some-token"; + let org = "some-org"; + let query: Option = Some(Query::new("some-influx-query-string".to_string())); + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", "/api/v2/query") + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Accepting-Encoding", "identity") + .match_header("Content-Type", "application/json") + .match_query(Matcher::UrlEncoded("org".into(), org.into())) + .match_body( + serde_json::to_string(&query.clone().unwrap_or_default()) + .unwrap() + .as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.query_raw(org, query).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn query_raw_opt() { + let token = "some-token"; + let org = "some-org"; + let query: Option = None; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", "/api/v2/query") + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Accepting-Encoding", "identity") + .match_header("Content-Type", "application/json") + .match_query(Matcher::UrlEncoded("org".into(), org.into())) + .match_body( + #[allow(clippy::unnecessary_literal_unwrap)] + serde_json::to_string(&query.unwrap_or_default()) + .unwrap() + .as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.query_raw(org, None).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn query_analyze() { + let token = "some-token"; + let query: Option = Some(Query::new("some-influx-query-string".to_string())); + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", "/api/v2/query/analyze") + .match_header("Authorization", format!("Token {token}").as_str()) + 
.match_header("Content-Type", "application/json") + .match_body( + serde_json::to_string(&query.clone().unwrap_or_default()) + .unwrap() + .as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.query_analyze(query).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn query_analyze_opt() { + let token = "some-token"; + let query: Option = None; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", "/api/v2/query/analyze") + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body( + serde_json::to_string(&query.clone().unwrap_or_default()) + .unwrap() + .as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.query_analyze(query).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn query_ast() { + let token = "some-token"; + let language_request: Option = + Some(LanguageRequest::new("some-influx-query-string".to_string())); + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", "/api/v2/query/ast") + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body( + serde_json::to_string(&language_request.clone().unwrap_or_default()) + .unwrap() + .as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.query_ast(language_request).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn query_ast_opt() { + let token = "some-token"; + let language_request: Option = None; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", 
"/api/v2/query/ast") + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body( + serde_json::to_string(&language_request.clone().unwrap_or_default()) + .unwrap() + .as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client.query_ast(language_request).await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn query_raw_no_results() { + let token = "some-token"; + let org = "some-org"; + let query: Option = Some(Query::new("some-influx-query-string".to_string())); + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", "/api/v2/query") + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Accepting-Encoding", "identity") + .match_header("Content-Type", "application/json") + .match_query(Matcher::UrlEncoded("org".into(), org.into())) + .match_body( + serde_json::to_string(&query.clone().unwrap_or_default()) + .unwrap() + .as_str(), + ) + .with_body("") + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let result = client.query_raw(org, query).await.expect("request success"); + assert_eq!(result, ""); + + mock.assert_async().await; + } +} diff --git a/influxdb2_client/src/api/ready.rs b/influxdb2_client/src/api/ready.rs new file mode 100644 index 0000000..6765316 --- /dev/null +++ b/influxdb2_client/src/api/ready.rs @@ -0,0 +1,47 @@ +//! Ready +//! +//! 
Check readiness of an InfluxDB instance at startup + +use reqwest::{Method, StatusCode}; +use snafu::ResultExt; + +use crate::{Client, HttpSnafu, RequestError, ReqwestProcessingSnafu}; + +impl Client { + /// Get the readiness of an instance at startup + pub async fn ready(&self) -> Result { + let ready_url = format!("{}/ready", self.url); + let response = self + .request(Method::GET, &ready_url) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::OK => Ok(true), + _ => { + let status = response.status(); + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mockito::Server; + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn ready() { + let mut mock_server = Server::new_async().await; + let mock = mock_server.mock("GET", "/ready").create_async().await; + + let client = Client::new(mock_server.url(), ""); + + let _result = client.ready().await; + + mock.assert_async().await; + } +} diff --git a/influxdb2_client/src/api/setup.rs b/influxdb2_client/src/api/setup.rs new file mode 100644 index 0000000..590283b --- /dev/null +++ b/influxdb2_client/src/api/setup.rs @@ -0,0 +1,261 @@ +//! Onboarding/Setup +//! +//! Initate and start onboarding process of InfluxDB server. 
+ +use crate::{Client, HttpSnafu, RequestError, ReqwestProcessingSnafu, SerializingSnafu}; +use reqwest::{Method, StatusCode}; +use snafu::ResultExt; + +use crate::models::{IsOnboarding, OnboardingRequest, OnboardingResponse}; + +impl Client { + /// Check if database has default user, org, bucket + pub async fn is_onboarding_allowed(&self) -> Result { + let setup_url = format!("{}/api/v2/setup", self.url); + let response = self + .request(Method::GET, &setup_url) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::OK => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)? + .allowed), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } + + /// Set up initial user, org and bucket + pub async fn onboarding( + &self, + username: &str, + org: &str, + bucket: &str, + password: Option, + retention_period_hrs: Option, + retention_period_seconds: Option, + ) -> Result { + let setup_init_url = format!("{}/api/v2/setup", self.url); + + let body = OnboardingRequest { + username: username.into(), + org: org.into(), + bucket: bucket.into(), + password, + retention_period_hrs, + retention_period_seconds, + }; + + let response = self + .request(Method::POST, &setup_init_url) + .header("Content-Type", "application/json") + .body(serde_json::to_string(&body).context(SerializingSnafu)?) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::CREATED => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? 
+ } + } + } + + /// Set up a new user, org and bucket + pub async fn post_setup_user( + &self, + username: &str, + org: &str, + bucket: &str, + password: Option, + retention_period_hrs: Option, + retention_period_seconds: Option, + ) -> Result { + let setup_new_url = format!("{}/api/v2/setup/user", self.url); + + let body = OnboardingRequest { + username: username.into(), + org: org.into(), + bucket: bucket.into(), + password, + retention_period_hrs, + retention_period_seconds, + }; + + let response = self + .request(Method::POST, &setup_new_url) + .header("Content-Type", "application/json") + .body(serde_json::to_string(&body).context(SerializingSnafu)?) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + match response.status() { + StatusCode::CREATED => Ok(response + .json::() + .await + .context(ReqwestProcessingSnafu)?), + status => { + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()? + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mockito::Server; + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn is_onboarding_allowed() { + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("GET", "/api/v2/setup") + .create_async() + .await; + + let client = Client::new(mock_server.url(), ""); + + let _result = client.is_onboarding_allowed().await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn onboarding() { + let token = "some-token"; + let username = "some-user"; + let org = "some-org"; + let bucket = "some-bucket"; + let password = "some-password"; + let retention_period_hrs = 1; + + let mut mock_server = Server::new_async().await; + let mock = mock_server.mock("POST", "/api/v2/setup") + .match_header("Content-Type", "application/json") + .match_body( + format!( + 
r#"{{"username":"{username}","org":"{org}","bucket":"{bucket}","password":"{password}","retentionPeriodHrs":{retention_period_hrs}}}"# + ).as_str(), + ) + .create_async().await; + + let client = Client::new(mock_server.url(), token); + + let _result = client + .onboarding( + username, + org, + bucket, + Some(password.to_string()), + Some(retention_period_hrs), + None, + ) + .await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn post_setup_user() { + let token = "some-token"; + let username = "some-user"; + let org = "some-org"; + let bucket = "some-bucket"; + let password = "some-password"; + let retention_period_hrs = 1; + + let mut mock_server = Server::new_async().await; + let mock = mock_server.mock("POST", "/api/v2/setup/user") + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body( + format!( + r#"{{"username":"{username}","org":"{org}","bucket":"{bucket}","password":"{password}","retentionPeriodHrs":{retention_period_hrs}}}"# + ).as_str(), + ) + .create_async().await; + + let client = Client::new(mock_server.url(), token); + + let _result = client + .post_setup_user( + username, + org, + bucket, + Some(password.to_string()), + Some(retention_period_hrs), + None, + ) + .await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn onboarding_opt() { + let username = "some-user"; + let org = "some-org"; + let bucket = "some-bucket"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", "/api/v2/setup") + .match_header("Content-Type", "application/json") + .match_body( + format!(r#"{{"username":"{username}","org":"{org}","bucket":"{bucket}"}}"#,) + .as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), ""); + + let _result = client + .onboarding(username, org, bucket, None, None, None) + 
.await; + + mock.assert_async().await; + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn post_setup_user_opt() { + let token = "some-token"; + let username = "some-user"; + let org = "some-org"; + let bucket = "some-bucket"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock("POST", "/api/v2/setup/user") + .match_header("Authorization", format!("Token {token}").as_str()) + .match_header("Content-Type", "application/json") + .match_body( + format!(r#"{{"username":"{username}","org":"{org}","bucket":"{bucket}"}}"#,) + .as_str(), + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let _result = client + .post_setup_user(username, org, bucket, None, None, None) + .await; + + mock.assert_async().await; + } +} diff --git a/influxdb2_client/src/api/write.rs b/influxdb2_client/src/api/write.rs new file mode 100644 index 0000000..f97b555 --- /dev/null +++ b/influxdb2_client/src/api/write.rs @@ -0,0 +1,116 @@ +//! Write API + +use crate::models::WriteDataPoint; +use crate::{Client, HttpSnafu, RequestError, ReqwestProcessingSnafu}; +use bytes::BufMut; +use futures::{Stream, StreamExt}; +use reqwest::{Body, Method}; +use snafu::ResultExt; +use std::io::{self, Write}; + +impl Client { + /// Write line protocol data to the specified organization and bucket. 
+ pub async fn write_line_protocol( + &self, + org: &str, + bucket: &str, + body: impl Into + Send, + ) -> Result<(), RequestError> { + let body = body.into(); + let write_url = format!("{}/api/v2/write", self.url); + + let response = self + .request(Method::POST, &write_url) + .query(&[("bucket", bucket), ("org", org)]) + .body(body) + .send() + .await + .context(ReqwestProcessingSnafu)?; + + if !response.status().is_success() { + let status = response.status(); + let text = response.text().await.context(ReqwestProcessingSnafu)?; + HttpSnafu { status, text }.fail()?; + } + + Ok(()) + } + + /// Write a `Stream` of `DataPoint`s to the specified organization and + /// bucket. + pub async fn write( + &self, + org: &str, + bucket: &str, + body: impl Stream + Send + Sync + 'static, + ) -> Result<(), RequestError> { + let mut buffer = bytes::BytesMut::new(); + + let body = body.map(move |point| { + let mut w = (&mut buffer).writer(); + point.write_data_point_to(&mut w)?; + w.flush()?; + Ok::<_, io::Error>(buffer.split().freeze()) + }); + + let body = Body::wrap_stream(body); + + self.write_line_protocol(org, bucket, body).await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::models::DataPoint; + use futures::stream; + use mockito::Server; + + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] + async fn writing_points() { + let org = "some-org"; + let bucket = "some-bucket"; + let token = "some-token"; + + let mut mock_server = Server::new_async().await; + let mock = mock_server + .mock( + "POST", + format!("/api/v2/write?bucket={bucket}&org={org}").as_str(), + ) + .match_header("Authorization", format!("Token {token}").as_str()) + .match_body( + "\ +cpu,host=server01 usage=0.5 +cpu,host=server01,region=us-west usage=0.87 +", + ) + .create_async() + .await; + + let client = Client::new(mock_server.url(), token); + + let points = vec![ + DataPoint::builder("cpu") + .tag("host", "server01") + .field("usage", 0.5) + .build() + .unwrap(), + 
DataPoint::builder("cpu") + .tag("host", "server01") + .tag("region", "us-west") + .field("usage", 0.87) + .build() + .unwrap(), + ]; + + // If the requests made are incorrect, Mockito returns status 501 and `write` + // will return an error, which causes the test to fail here instead of + // when we assert on mock_server. The error messages that Mockito + // provides are much clearer for explaining why a test failed than just + // that the server returned 501, so don't use `?` here. + let _result = client.write(org, bucket, stream::iter(points)).await; + + mock.assert_async().await; + } +} diff --git a/influxdb2_client/src/common.rs b/influxdb2_client/src/common.rs new file mode 100644 index 0000000..d51ea20 --- /dev/null +++ b/influxdb2_client/src/common.rs @@ -0,0 +1,8 @@ +//! Common +//! +//! Collection of helper functions + +/// Serialize to application/x-www-form-urlencoded syntax +pub fn urlencode>(s: T) -> String { + ::url::form_urlencoded::byte_serialize(s.as_ref().as_bytes()).collect() +} diff --git a/influxdb2_client/src/lib.rs b/influxdb2_client/src/lib.rs new file mode 100644 index 0000000..0db577e --- /dev/null +++ b/influxdb2_client/src/lib.rs @@ -0,0 +1,203 @@ +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +// `clippy::use_self` is deliberately excluded from the lints this crate uses. +// See . +#![warn( + missing_copy_implementations, + missing_debug_implementations, + missing_docs, + clippy::explicit_iter_loop, + clippy::clone_on_ref_ptr, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +//! # influxdb2_client +//! +//! This is a Rust client to InfluxDB using the [2.0 API][2api]. +//! +//! [2api]: https://v2.docs.influxdata.com/v2.0/reference/api/ +//! +//! ## Work Remaining +//! +//! - Query +//! - optional sync client +//! - Influx 1.x API? +//! - Other parts of the API +//! 
- Pick the best name to use on crates.io and publish +//! +//! ## Quick start +//! +//! This example creates a client to an InfluxDB server running at `http://localhost:8888`, creates +//! a bucket with the name "mybucket" in the organization with name "myorg" and +//! ID "0000111100001111", builds two points, and writes the points to the +//! bucket. +//! +//! ``` +//! async fn example() -> Result<(), Box> { +//! use influxdb2_client::Client; +//! use influxdb2_client::models::{DataPoint, PostBucketRequest}; +//! use futures::stream; +//! +//! let org = "myorg"; +//! let org_id = "0000111100001111"; +//! let bucket = "mybucket"; +//! +//! let client = Client::new("http://localhost:8888", "some-token"); +//! +//! client.create_bucket( +//! Some(PostBucketRequest::new(org_id.to_string(), bucket.to_string())) +//! ).await?; +//! +//! let points = vec![ +//! DataPoint::builder("cpu") +//! .tag("host", "server01") +//! .field("usage", 0.5) +//! .build()?, +//! DataPoint::builder("cpu") +//! .tag("host", "server01") +//! .tag("region", "us-west") +//! .field("usage", 0.87) +//! .build()?, +//! ]; +//! +//! client.write(org, bucket, stream::iter(points)).await?; +//! Ok(()) +//! } +//! ``` + +// Workaround for "unused crate" lint false positives. +#[cfg(test)] +use once_cell as _; +#[cfg(test)] +use parking_lot as _; +#[cfg(test)] +use test_helpers as _; + +use reqwest::Method; +use snafu::Snafu; + +/// Errors that occur while making requests to the Influx server. +#[derive(Debug, Snafu)] +pub enum RequestError { + /// While making a request to the Influx server, the underlying `reqwest` + /// library returned an error that was not an HTTP 400 or 500. + #[snafu(display("Error while processing the HTTP request: {}", source))] + ReqwestProcessing { + /// The underlying error object from `reqwest`. + source: reqwest::Error, + }, + /// The underlying `reqwest` library returned an HTTP error with code 400 + /// (meaning a client error) or 500 (meaning a server error). 
+ #[snafu(display("HTTP request returned an error: {}, `{}`", status, text))] + Http { + /// The `StatusCode` returned from the request + status: reqwest::StatusCode, + /// Any text data returned from the request + text: String, + }, + + /// While serializing data as JSON to send in a request, the underlying + /// `serde_json` library returned an error. + #[snafu(display("Error while serializing to JSON: {}", source))] + Serializing { + /// The underlying error object from `serde_json`. + source: serde_json::error::Error, + }, + + /// While deserializing the response as JSON, something went wrong. + #[snafu(display("Could not deserialize as JSON. Error: {source}\nText: `{text}`"))] + DeserializingJsonResponse { + /// The text of the response + text: String, + /// The underlying error object from serde + source: serde_json::Error, + }, + + /// Something went wrong getting the raw bytes of the response + #[snafu(display("Could not get response bytes: {source}"))] + ResponseBytes { + /// The underlying error object from reqwest + source: reqwest::Error, + }, + + /// Something went wrong converting the raw bytes of the response to a UTF-8 string + #[snafu(display("Invalid UTF-8: {source}"))] + ResponseString { + /// The underlying error object from std + source: std::string::FromUtf8Error, + }, +} + +/// Client to a server supporting the InfluxData 2.0 API. +#[derive(Debug, Clone)] +pub struct Client { + /// The base URL this client sends requests to + pub url: String, + auth_header: Option, + reqwest: reqwest::Client, + jaeger_debug_header: Option, +} + +impl Client { + /// Default [jaeger debug header](Self::with_jaeger_debug) that should work in many + /// environments. + pub const DEFAULT_JAEGER_DEBUG_HEADER: &'static str = "jaeger-debug-id"; + + /// Create a new client pointing to the URL specified in + /// `protocol://server:port` format and using the specified token for + /// authorization. 
+ /// + /// # Example + /// + /// ``` + /// let client = influxdb2_client::Client::new("http://localhost:8888", "my-token"); + /// ``` + pub fn new(url: impl Into, auth_token: impl Into) -> Self { + let token = auth_token.into(); + let auth_header = if token.is_empty() { + None + } else { + Some(format!("Token {token}")) + }; + + Self { + url: url.into(), + auth_header, + reqwest: reqwest::Client::builder() + .connection_verbose(true) + .build() + .expect("reqwest::Client should have built"), + jaeger_debug_header: None, + } + } + + /// Enable generation of jaeger debug headers with the given header name. + pub fn with_jaeger_debug(self, header: String) -> Self { + Self { + jaeger_debug_header: Some(header), + ..self + } + } + + /// Consolidate common request building code + fn request(&self, method: Method, url: &str) -> reqwest::RequestBuilder { + let mut req = self.reqwest.request(method, url); + + if let Some(auth) = &self.auth_header { + req = req.header("Authorization", auth); + } + if let Some(header) = &self.jaeger_debug_header { + req = req.header(header, format!("influxdb_client-{}", uuid::Uuid::new_v4())); + } + + req + } +} + +pub mod common; + +pub mod api; +pub mod models; diff --git a/influxdb2_client/src/models/ast/call_expression.rs b/influxdb2_client/src/models/ast/call_expression.rs new file mode 100644 index 0000000..23dcc1c --- /dev/null +++ b/influxdb2_client/src/models/ast/call_expression.rs @@ -0,0 +1,24 @@ +//! 
CallExpression + +use serde::{Deserialize, Serialize}; + +/// Represents a function call +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct CallExpression { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Callee + #[serde(skip_serializing_if = "Option::is_none")] + pub callee: Option>, + /// Function arguments + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub arguments: Vec, +} + +impl CallExpression { + /// Represents a function call + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/dialect.rs b/influxdb2_client/src/models/ast/dialect.rs new file mode 100644 index 0000000..d4e5c78 --- /dev/null +++ b/influxdb2_client/src/models/ast/dialect.rs @@ -0,0 +1,54 @@ +//! Dialect + +use serde::{Deserialize, Serialize}; + +/// Dialect are options to change the default CSV output format; +/// +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Dialect { + /// If true, the results will contain a header row + #[serde(skip_serializing_if = "Option::is_none")] + pub header: Option, + /// Separator between cells; the default is , + #[serde(skip_serializing_if = "Option::is_none")] + pub delimiter: Option, + /// + #[serde(skip_serializing_if = "Option::is_none")] + pub annotations: Option>, + /// Character prefixed to comment strings + #[serde(skip_serializing_if = "Option::is_none")] + pub comment_prefix: Option, + /// Format of timestamps + #[serde(skip_serializing_if = "Option::is_none")] + pub date_time_format: Option, +} + +impl Dialect { + /// Dialect are options to change the default CSV output format; + /// + pub fn new() -> Self { + Self::default() + } +} + +/// +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Annotations { + /// Group Annotation + 
Group, + /// Datatype Annotation + Datatype, + /// Default Annotation + Default, +} + +/// Timestamp Format +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +pub enum DateTimeFormat { + /// RFC3339 + Rfc3339, + /// RFC3339Nano + Rfc3339Nano, +} diff --git a/influxdb2_client/src/models/ast/dict_item.rs b/influxdb2_client/src/models/ast/dict_item.rs new file mode 100644 index 0000000..1f1fa40 --- /dev/null +++ b/influxdb2_client/src/models/ast/dict_item.rs @@ -0,0 +1,24 @@ +//! DictItem + +use serde::{Deserialize, Serialize}; + +/// A key/value pair in a dictionary +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct DictItem { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Key + #[serde(skip_serializing_if = "Option::is_none")] + pub key: Option, + /// Value + #[serde(skip_serializing_if = "Option::is_none")] + pub val: Option, +} + +impl DictItem { + /// A key/value pair in a dictionary + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/duration.rs b/influxdb2_client/src/models/ast/duration.rs new file mode 100644 index 0000000..fae6a61 --- /dev/null +++ b/influxdb2_client/src/models/ast/duration.rs @@ -0,0 +1,27 @@ +//! Duration + +use serde::{Deserialize, Serialize}; + +/// Duration : A pair consisting of length of time and the unit of time +/// measured. It is the atomic unit from which all duration literals are +/// composed. 
+#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Duration { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Duration Magnitude + #[serde(skip_serializing_if = "Option::is_none")] + pub magnitude: Option, + /// Duration unit + #[serde(skip_serializing_if = "Option::is_none")] + pub unit: Option, +} + +impl Duration { + /// A pair consisting of length of time and the unit of time measured. It is + /// the atomic unit from which all duration literals are composed. + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/expression.rs b/influxdb2_client/src/models/ast/expression.rs new file mode 100644 index 0000000..5d4b48b --- /dev/null +++ b/influxdb2_client/src/models/ast/expression.rs @@ -0,0 +1,84 @@ +//! Expression + +use serde::{Deserialize, Serialize}; + +/// Expression AST +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Expression { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Elements of the dictionary + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub elements: Vec, + /// Function parameters + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub params: Vec, + /// Node + #[serde(skip_serializing_if = "Option::is_none")] + pub body: Option, + /// Operator + #[serde(skip_serializing_if = "Option::is_none")] + pub operator: Option, + /// Left leaf + #[serde(skip_serializing_if = "Option::is_none")] + pub left: Option>, + /// Right leaf + #[serde(skip_serializing_if = "Option::is_none")] + pub right: Option>, + /// Parent Expression + #[serde(skip_serializing_if = "Option::is_none")] + pub callee: Option>, + /// Function arguments + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub arguments: Vec, + /// Test Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub 
test: Option>, + /// Alternate Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub alternate: Option>, + /// Consequent Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub consequent: Option>, + /// Object Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub object: Option>, + /// PropertyKey Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub property: Option>, + /// Array Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub array: Option>, + /// Index Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub index: Option>, + /// Properties + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub properties: Vec, + /// Expression + #[serde(skip_serializing_if = "Option::is_none")] + pub expression: Option>, + /// Argument + #[serde(skip_serializing_if = "Option::is_none")] + pub argument: Option>, + /// Call Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub call: Option, + /// Expression Value + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option, + /// Duration values + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub values: Vec, + /// Expression Name + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, +} + +impl Expression { + /// Return instance of expression + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/identifier.rs b/influxdb2_client/src/models/ast/identifier.rs new file mode 100644 index 0000000..361c223 --- /dev/null +++ b/influxdb2_client/src/models/ast/identifier.rs @@ -0,0 +1,21 @@ +//! 
Idendifier + +use serde::{Deserialize, Serialize}; + +/// A valid Flux identifier +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Identifier { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Identifier Name + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, +} + +impl Identifier { + /// A valid Flux identifier + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/import_declaration.rs b/influxdb2_client/src/models/ast/import_declaration.rs new file mode 100644 index 0000000..5caec07 --- /dev/null +++ b/influxdb2_client/src/models/ast/import_declaration.rs @@ -0,0 +1,24 @@ +//! ImportDeclaration + +use serde::{Deserialize, Serialize}; + +/// Declares a package import +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct ImportDeclaration { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Import Identifier + #[serde(rename = "as", skip_serializing_if = "Option::is_none")] + pub r#as: Option, + /// Import Path + #[serde(skip_serializing_if = "Option::is_none")] + pub path: Option, +} + +impl ImportDeclaration { + /// Declares a package import + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/member_expression.rs b/influxdb2_client/src/models/ast/member_expression.rs new file mode 100644 index 0000000..d3e4dba --- /dev/null +++ b/influxdb2_client/src/models/ast/member_expression.rs @@ -0,0 +1,24 @@ +//! 
MemberExpression + +use serde::{Deserialize, Serialize}; + +/// Represents accessing a property of an object +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct MemberExpression { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Member object + #[serde(skip_serializing_if = "Option::is_none")] + pub object: Option, + /// Member Property + #[serde(skip_serializing_if = "Option::is_none")] + pub property: Option, +} + +impl MemberExpression { + /// Represents accessing a property of an object + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/mod.rs b/influxdb2_client/src/models/ast/mod.rs new file mode 100644 index 0000000..f170a85 --- /dev/null +++ b/influxdb2_client/src/models/ast/mod.rs @@ -0,0 +1,34 @@ +//! Query AST models + +pub mod identifier; +pub use self::identifier::Identifier; +pub mod statement; +pub use self::statement::Statement; +pub mod expression; +pub use self::expression::Expression; +pub mod call_expression; +pub use self::call_expression::CallExpression; +pub mod member_expression; +pub use self::member_expression::MemberExpression; +pub mod string_literal; +pub use self::string_literal::StringLiteral; +pub mod dict_item; +pub use self::dict_item::DictItem; +pub mod variable_assignment; +pub use self::variable_assignment::VariableAssignment; +pub mod node; +pub use self::node::Node; +pub mod property; +pub use self::property::Property; +pub mod property_key; +pub use self::property_key::PropertyKey; +pub mod dialect; +pub use self::dialect::Dialect; +pub mod import_declaration; +pub use self::import_declaration::ImportDeclaration; +pub mod package; +pub use self::package::Package; +pub mod package_clause; +pub use self::package_clause::PackageClause; +pub mod duration; +pub use self::duration::Duration; diff --git a/influxdb2_client/src/models/ast/node.rs b/influxdb2_client/src/models/ast/node.rs 
new file mode 100644 index 0000000..e6bfcf5 --- /dev/null +++ b/influxdb2_client/src/models/ast/node.rs @@ -0,0 +1,84 @@ +//! Node + +use serde::{Deserialize, Serialize}; + +/// Node +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Node { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Elements of the dictionary + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub elements: Vec, + /// Function parameters + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub params: Vec, + /// Block body + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub body: Vec, + /// Node Operator + #[serde(skip_serializing_if = "Option::is_none")] + pub operator: Option, + /// Left left node + #[serde(skip_serializing_if = "Option::is_none")] + pub left: Option>, + /// Right right node + #[serde(skip_serializing_if = "Option::is_none")] + pub right: Option>, + /// Parent node + #[serde(skip_serializing_if = "Option::is_none")] + pub callee: Option>, + /// Function arguments + #[serde(skip_serializing_if = "Vec::is_empty")] + pub arguments: Vec, + /// Test Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub test: Option>, + /// Alternate Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub alternate: Option>, + /// Consequent Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub consequent: Option>, + /// Object Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub object: Option>, + /// PropertyKey + #[serde(skip_serializing_if = "Option::is_none")] + pub property: Option, + /// Array Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub array: Option>, + /// Index Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub index: Option>, + /// Object properties + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub properties: Vec, + /// Expression + #[serde(skip_serializing_if = 
"Option::is_none")] + pub expression: Option>, + /// Node arguments + #[serde(skip_serializing_if = "Option::is_none")] + pub argument: Option>, + /// Call Expr + #[serde(skip_serializing_if = "Option::is_none")] + pub call: Option, + /// Node Value + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option, + /// Duration values + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub values: Vec, + /// Node name + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, +} + +impl Node { + /// Return instance of Node + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/package.rs b/influxdb2_client/src/models/ast/package.rs new file mode 100644 index 0000000..84f97a0 --- /dev/null +++ b/influxdb2_client/src/models/ast/package.rs @@ -0,0 +1,28 @@ +//! Package + +use crate::models::File; +use serde::{Deserialize, Serialize}; + +/// Represents a complete package source tree. +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Package { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Package import path + #[serde(skip_serializing_if = "Option::is_none")] + pub path: Option, + /// Package name + #[serde(skip_serializing_if = "Option::is_none")] + pub package: Option, + /// Package files + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub files: Vec, +} + +impl Package { + /// Represents a complete package source tree. + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/package_clause.rs b/influxdb2_client/src/models/ast/package_clause.rs new file mode 100644 index 0000000..ec90473 --- /dev/null +++ b/influxdb2_client/src/models/ast/package_clause.rs @@ -0,0 +1,21 @@ +//! 
PackageClause + +use serde::{Deserialize, Serialize}; + +/// Defines a package identifier +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct PackageClause { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Package name + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, +} + +impl PackageClause { + /// Defines a package identifier + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/property.rs b/influxdb2_client/src/models/ast/property.rs new file mode 100644 index 0000000..ee5efd2 --- /dev/null +++ b/influxdb2_client/src/models/ast/property.rs @@ -0,0 +1,24 @@ +//! Property + +use serde::{Deserialize, Serialize}; + +/// The value associated with a key +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Property { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Property Key + #[serde(skip_serializing_if = "Option::is_none")] + pub key: Option, + /// Property Value + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option, +} + +impl Property { + /// The value associated with a key + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/property_key.rs b/influxdb2_client/src/models/ast/property_key.rs new file mode 100644 index 0000000..71f521c --- /dev/null +++ b/influxdb2_client/src/models/ast/property_key.rs @@ -0,0 +1,24 @@ +//! 
PropertyKey + +use serde::{Deserialize, Serialize}; + +/// Key value pair +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct PropertyKey { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// PropertyKey name + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + /// PropertyKey value + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option, +} + +impl PropertyKey { + /// Returns an instance of PropertyKey + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/statement.rs b/influxdb2_client/src/models/ast/statement.rs new file mode 100644 index 0000000..edbc526 --- /dev/null +++ b/influxdb2_client/src/models/ast/statement.rs @@ -0,0 +1,39 @@ +//! Statement + +use serde::{Deserialize, Serialize}; + +/// Expression Statement +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Statement { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Raw source text + #[serde(skip_serializing_if = "Option::is_none")] + pub text: Option, + /// Statement identifier + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + /// Initial Value + #[serde(skip_serializing_if = "Option::is_none")] + pub init: Option, + /// Member + #[serde(skip_serializing_if = "Option::is_none")] + pub member: Option, + /// Expression + #[serde(skip_serializing_if = "Option::is_none")] + pub expression: Option, + /// Argument + #[serde(skip_serializing_if = "Option::is_none")] + pub argument: Option, + /// Assignment + #[serde(skip_serializing_if = "Option::is_none")] + pub assignment: Option, +} + +impl Statement { + /// Returns an instance of Statement + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/string_literal.rs b/influxdb2_client/src/models/ast/string_literal.rs
new file mode 100644 index 0000000..e5c144e --- /dev/null +++ b/influxdb2_client/src/models/ast/string_literal.rs @@ -0,0 +1,21 @@ +//! StringLiteral + +use serde::{Deserialize, Serialize}; + +/// Expressions begin and end with double quote marks +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct StringLiteral { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// StringLiteral Value + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option, +} + +impl StringLiteral { + /// Expressions begin and end with double quote marks + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/ast/variable_assignment.rs b/influxdb2_client/src/models/ast/variable_assignment.rs new file mode 100644 index 0000000..f9f6111 --- /dev/null +++ b/influxdb2_client/src/models/ast/variable_assignment.rs @@ -0,0 +1,24 @@ +//! VariableAssignment + +use serde::{Deserialize, Serialize}; + +/// Represents the declaration of a variable +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct VariableAssignment { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Variable Identifier + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + /// Variable initial value + #[serde(skip_serializing_if = "Option::is_none")] + pub init: Option, +} + +impl VariableAssignment { + /// Represents the declaration of a variable + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/authorization.rs b/influxdb2_client/src/models/authorization.rs new file mode 100644 index 0000000..fe4878d --- /dev/null +++ b/influxdb2_client/src/models/authorization.rs @@ -0,0 +1,88 @@ +//! Authorization +//! +//! 
Auth tokens for InfluxDB + +use serde::{Deserialize, Serialize}; + +/// Authorization to create +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Authorization { + /// If inactive the token is inactive and requests using the token will be + /// rejected. + #[serde(skip_serializing_if = "Option::is_none")] + pub status: Option, + /// A description of the token. + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option, + /// Auth created_at + #[serde(skip_serializing_if = "Option::is_none")] + pub created_at: Option, + /// Auth updated_at + #[serde(skip_serializing_if = "Option::is_none")] + pub updated_at: Option, + /// ID of org that authorization is scoped to. + #[serde(rename = "orgID")] + pub org_id: String, + /// List of permissions for an auth. An auth must have at least one + /// Permission. + pub permissions: Vec, + /// Auth ID. + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + /// Passed via the Authorization Header and Token Authentication type. + #[serde(skip_serializing_if = "Option::is_none")] + pub token: Option, + /// ID of user that created and owns the token. + #[serde(rename = "userID", skip_serializing_if = "Option::is_none")] + pub user_id: Option, + /// Name of user that created and owns the token. + #[serde(skip_serializing_if = "Option::is_none")] + pub user: Option, + /// Name of the org token is scoped to. + #[serde(skip_serializing_if = "Option::is_none")] + pub org: Option, + /// Links + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option, +} + +impl Authorization { + /// Returns an Authorization with the given orgID and permissions + pub fn new(org_id: String, permissions: Vec) -> Self { + Self { + org_id, + permissions, + ..Default::default() + } + } +} + +/// If inactive the token is inactive and requests using the token will be +/// rejected. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Status { + /// Token is active. + Active, + /// Token is inactive. + Inactive, +} + +/// AuthorizationAllOfLinks +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct AuthorizationAllOfLinks { + /// Self + #[serde(rename = "self", skip_serializing_if = "Option::is_none")] + pub self_: Option, + /// User + #[serde(skip_serializing_if = "Option::is_none")] + pub user: Option, +} + +impl AuthorizationAllOfLinks { + /// Return an instance of AuthorizationAllOfLinks + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/bucket.rs b/influxdb2_client/src/models/bucket.rs new file mode 100644 index 0000000..432b785 --- /dev/null +++ b/influxdb2_client/src/models/bucket.rs @@ -0,0 +1,142 @@ +//! Bucket + +use serde::{Deserialize, Serialize}; + +/// Bucket Schema +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Bucket { + /// BucketLinks + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option, + /// Bucket ID + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + /// Bucket Type + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Bucket name + pub name: String, + /// Bucket description + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option, + /// Organization ID of bucket + #[serde(rename = "orgID", skip_serializing_if = "Option::is_none")] + pub org_id: Option, + /// RP + #[serde(skip_serializing_if = "Option::is_none")] + pub rp: Option, + /// Created At + #[serde(skip_serializing_if = "Option::is_none")] + pub created_at: Option, + /// Updated At + #[serde(skip_serializing_if = "Option::is_none")] + pub updated_at: Option, + /// Rules to expire or retain data. No rules means data never expires. 
+ pub retention_rules: Vec, + /// Bucket labels + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub labels: Vec, +} + +impl Bucket { + /// Returns instance of Bucket + pub fn new(name: String, retention_rules: Vec) -> Self { + Self { + name, + retention_rules, + ..Default::default() + } + } +} + +/// Bucket Type +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Type { + /// User + User, + /// System + System, +} + +/// Bucket links +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct BucketLinks { + /// Labels + #[serde(skip_serializing_if = "Option::is_none")] + pub labels: Option, + /// Members + #[serde(skip_serializing_if = "Option::is_none")] + pub members: Option, + /// Organization + #[serde(skip_serializing_if = "Option::is_none")] + pub org: Option, + /// Owners + #[serde(skip_serializing_if = "Option::is_none")] + pub owners: Option, + /// Self + #[serde(rename = "self", skip_serializing_if = "Option::is_none")] + pub self_: Option, + /// Write + #[serde(skip_serializing_if = "Option::is_none")] + pub write: Option, +} + +impl BucketLinks { + /// Returns instance of BucketLinks + pub fn new() -> Self { + Self::default() + } +} + +/// List all buckets +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Buckets { + /// Links + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option, + /// Buckets + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub buckets: Vec, +} + +impl Buckets { + /// Returns list of buckets + pub fn new() -> Self { + Self::default() + } +} + +/// PostBucketRequest +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[serde(rename_all = "camelCase")] +pub struct PostBucketRequest { + /// Organization ID + #[serde(rename = "orgID")] + pub org_id: String, + 
/// Bucket name + pub name: String, + /// Bucket Description + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option, + /// RP + #[serde(skip_serializing_if = "Option::is_none")] + pub rp: Option, + /// Rules to expire or retain data. No rules means data never expires. + #[serde(default)] + pub retention_rules: Vec, +} + +impl PostBucketRequest { + /// Returns instance of PostBucketRequest + pub fn new(org_id: String, name: String) -> Self { + Self { + org_id, + name, + ..Default::default() + } + } +} diff --git a/influxdb2_client/src/models/data_point.rs b/influxdb2_client/src/models/data_point.rs new file mode 100644 index 0000000..ff4877a --- /dev/null +++ b/influxdb2_client/src/models/data_point.rs @@ -0,0 +1,511 @@ +//! Data point building and writing + +use snafu::{ensure, Snafu}; +use std::{collections::BTreeMap, io}; + +/// Errors that occur while building `DataPoint`s +#[derive(Debug, Snafu)] +pub enum DataPointError { + /// Returned when calling `build` on a `DataPointBuilder` that has no + /// fields. + #[snafu(display( + "All `DataPoints` must have at least one field. Builder contains: {:?}", + data_point_builder + ))] + AtLeastOneFieldRequired { + /// The current state of the `DataPointBuilder` + data_point_builder: DataPointBuilder, + }, +} + +/// Incrementally constructs a `DataPoint`. +/// +/// Create this via `DataPoint::builder`. +#[derive(Debug)] +pub struct DataPointBuilder { + measurement: String, + // Keeping the tags sorted improves performance on the server side + tags: BTreeMap, + fields: BTreeMap, + timestamp: Option, +} + +impl DataPointBuilder { + fn new(measurement: impl Into) -> Self { + Self { + measurement: measurement.into(), + tags: Default::default(), + fields: Default::default(), + timestamp: Default::default(), + } + } + + /// Sets a tag, replacing any existing tag of the same name. 
+ pub fn tag(mut self, name: impl Into, value: impl Into) -> Self { + self.tags.insert(name.into(), value.into()); + self + } + + /// Sets a field, replacing any existing field of the same name. + pub fn field(mut self, name: impl Into, value: impl Into) -> Self { + self.fields.insert(name.into(), value.into()); + self + } + + /// Sets the timestamp, replacing any existing timestamp. + /// + /// The value is treated as the number of nanoseconds since the + /// UNIX epoch. + pub fn timestamp(mut self, value: i64) -> Self { + self.timestamp = Some(value); + self + } + + /// Constructs the data point + pub fn build(self) -> Result { + ensure!( + !self.fields.is_empty(), + AtLeastOneFieldRequiredSnafu { + data_point_builder: self + } + ); + + let Self { + measurement, + tags, + fields, + timestamp, + } = self; + + Ok(DataPoint { + measurement, + tags, + fields, + timestamp, + }) + } +} + +/// A single point of information to send to InfluxDB. +// TODO: If we want to support non-UTF-8 data, all `String`s stored in `DataPoint` would need +// to be `Vec` instead, the API for creating a `DataPoint` would need some more consideration, +// and there would need to be more `Write*` trait implementations. Because the `Write*` traits work +// on a writer of bytes, that part of the design supports non-UTF-8 data now. +#[derive(Debug)] +pub struct DataPoint { + measurement: String, + tags: BTreeMap, + fields: BTreeMap, + timestamp: Option, +} + +impl DataPoint { + /// Create a builder to incrementally construct a `DataPoint`. 
+ pub fn builder(measurement: impl Into) -> DataPointBuilder { + DataPointBuilder::new(measurement) + } +} + +impl WriteDataPoint for DataPoint { + fn write_data_point_to(&self, mut w: W) -> io::Result<()> + where + W: io::Write, + { + self.measurement.write_measurement_to(&mut w)?; + + for (k, v) in &self.tags { + w.write_all(b",")?; + k.write_tag_key_to(&mut w)?; + w.write_all(b"=")?; + v.write_tag_value_to(&mut w)?; + } + + for (i, (k, v)) in self.fields.iter().enumerate() { + let d = if i == 0 { b" " } else { b"," }; + + w.write_all(d)?; + k.write_field_key_to(&mut w)?; + w.write_all(b"=")?; + v.write_field_value_to(&mut w)?; + } + + if let Some(ts) = self.timestamp { + w.write_all(b" ")?; + ts.write_timestamp_to(&mut w)?; + } + + w.write_all(b"\n")?; + + Ok(()) + } +} + +/// Possible value types +#[derive(Debug, Clone, PartialEq)] +pub enum FieldValue { + /// A true or false value + Bool(bool), + /// A 64-bit floating point number + F64(f64), + /// A 64-bit signed integer number + I64(i64), + /// A 64-bit unsigned integer number + U64(u64), + /// A string value + String(String), +} + +impl From for FieldValue { + fn from(other: bool) -> Self { + Self::Bool(other) + } +} + +impl From for FieldValue { + fn from(other: f64) -> Self { + Self::F64(other) + } +} + +impl From for FieldValue { + fn from(other: i64) -> Self { + Self::I64(other) + } +} + +impl From for FieldValue { + fn from(other: u64) -> Self { + Self::U64(other) + } +} + +impl From<&str> for FieldValue { + fn from(other: &str) -> Self { + Self::String(other.into()) + } +} + +impl From for FieldValue { + fn from(other: String) -> Self { + Self::String(other) + } +} + +/// Transform a type into valid line protocol lines +/// +/// This trait is to enable the conversion of `DataPoint`s to line protocol; it +/// is unlikely that you would need to implement this trait. 
In the future, a +/// `derive` crate may exist that would facilitate the generation of +/// implementations of this trait on custom types to help uphold the +/// responsibilities for escaping and producing complete lines. +pub trait WriteDataPoint { + /// Write this data point as line protocol. The implementor is responsible + /// for properly escaping the data and ensuring that complete lines + /// are generated. + fn write_data_point_to(&self, w: W) -> io::Result<()> + where + W: io::Write; +} + +// The following are traits rather than free functions so that we can limit +// their implementations to only the data types supported for each of +// measurement, tag key, tag value, field key, field value, and timestamp. They +// are a private implementation detail and any custom implementations +// of these traits would be generated by a future derive trait. +trait WriteMeasurement { + fn write_measurement_to(&self, w: W) -> io::Result<()> + where + W: io::Write; +} + +impl WriteMeasurement for str { + fn write_measurement_to(&self, w: W) -> io::Result<()> + where + W: io::Write, + { + escape_and_write_value(self, MEASUREMENT_DELIMITERS, w) + } +} + +trait WriteTagKey { + fn write_tag_key_to(&self, w: W) -> io::Result<()> + where + W: io::Write; +} + +impl WriteTagKey for str { + fn write_tag_key_to(&self, w: W) -> io::Result<()> + where + W: io::Write, + { + escape_and_write_value(self, TAG_KEY_DELIMITERS, w) + } +} + +trait WriteTagValue { + fn write_tag_value_to(&self, w: W) -> io::Result<()> + where + W: io::Write; +} + +impl WriteTagValue for str { + fn write_tag_value_to(&self, w: W) -> io::Result<()> + where + W: io::Write, + { + escape_and_write_value(self, TAG_VALUE_DELIMITERS, w) + } +} + +trait WriteFieldKey { + fn write_field_key_to(&self, w: W) -> io::Result<()> + where + W: io::Write; +} + +impl WriteFieldKey for str { + fn write_field_key_to(&self, w: W) -> io::Result<()> + where + W: io::Write, + { + escape_and_write_value(self, FIELD_KEY_DELIMITERS, 
w) + } +} + +trait WriteFieldValue { + fn write_field_value_to(&self, w: W) -> io::Result<()> + where + W: io::Write; +} + +impl WriteFieldValue for FieldValue { + fn write_field_value_to(&self, mut w: W) -> io::Result<()> + where + W: io::Write, + { + use FieldValue::*; + + match self { + Bool(v) => write!(w, "{}", if *v { "t" } else { "f" }), + F64(v) => write!(w, "{v}"), + I64(v) => write!(w, "{v}i"), + U64(v) => write!(w, "{v}u"), + String(v) => { + w.write_all(br#"""#)?; + escape_and_write_value(v, FIELD_VALUE_STRING_DELIMITERS, &mut w)?; + w.write_all(br#"""#) + } + } + } +} + +trait WriteTimestamp { + fn write_timestamp_to(&self, w: W) -> io::Result<()> + where + W: io::Write; +} + +impl WriteTimestamp for i64 { + fn write_timestamp_to(&self, mut w: W) -> io::Result<()> + where + W: io::Write, + { + write!(w, "{self}") + } +} + +const MEASUREMENT_DELIMITERS: &[char] = &[',', ' ']; +const TAG_KEY_DELIMITERS: &[char] = &[',', '=', ' ']; +const TAG_VALUE_DELIMITERS: &[char] = TAG_KEY_DELIMITERS; +const FIELD_KEY_DELIMITERS: &[char] = TAG_KEY_DELIMITERS; +const FIELD_VALUE_STRING_DELIMITERS: &[char] = &['"']; + +fn escape_and_write_value( + value: &str, + escaping_specification: &[char], + mut w: W, +) -> io::Result<()> +where + W: io::Write, +{ + let mut last = 0; + + for (idx, delim) in value.match_indices(escaping_specification) { + let s = &value[last..idx]; + write!(w, r#"{s}\{delim}"#)?; + last = idx + delim.len(); + } + + w.write_all(value[last..].as_bytes()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::str; + + fn assert_utf8_strings_eq(left: &[u8], right: &[u8]) { + assert_eq!( + left, + right, + "\n\nleft string value: `{}`,\nright string value: `{}`", + str::from_utf8(left).unwrap(), + str::from_utf8(right).unwrap(), + ); + } + + #[test] + fn point_builder_allows_setting_tags_and_fields() { + let point = DataPoint::builder("swap") + .tag("host", "server01") + .tag("name", "disk0") + .field("in", 3_i64) + .field("out", 4_i64) + 
.timestamp(1) + .build() + .unwrap(); + + assert_utf8_strings_eq( + &point.data_point_to_vec().unwrap(), + b"swap,host=server01,name=disk0 in=3i,out=4i 1\n".as_ref(), + ); + } + + #[test] + fn no_tags_or_timestamp() { + let point = DataPoint::builder("m0") + .field("f0", 1.0) + .field("f1", 2_i64) + .build() + .unwrap(); + + assert_utf8_strings_eq( + &point.data_point_to_vec().unwrap(), + b"m0 f0=1,f1=2i\n".as_ref(), + ); + } + + #[test] + fn no_timestamp() { + let point = DataPoint::builder("m0") + .tag("t0", "v0") + .tag("t1", "v1") + .field("f1", 2_i64) + .build() + .unwrap(); + + assert_utf8_strings_eq( + &point.data_point_to_vec().unwrap(), + b"m0,t0=v0,t1=v1 f1=2i\n".as_ref(), + ); + } + + #[test] + fn no_field() { + let point_result = DataPoint::builder("m0").build(); + + assert!(point_result.is_err()); + } + + const ALL_THE_DELIMITERS: &str = r#"alpha,beta=delta gamma"epsilon"#; + + #[test] + fn special_characters_are_escaped_in_measurements() { + assert_utf8_strings_eq( + &ALL_THE_DELIMITERS.measurement_to_vec().unwrap(), + br#"alpha\,beta=delta\ gamma"epsilon"#.as_ref(), + ); + } + + #[test] + fn special_characters_are_escaped_in_tag_keys() { + assert_utf8_strings_eq( + &ALL_THE_DELIMITERS.tag_key_to_vec().unwrap(), + br#"alpha\,beta\=delta\ gamma"epsilon"#.as_ref(), + ); + } + + #[test] + fn special_characters_are_escaped_in_tag_values() { + assert_utf8_strings_eq( + &ALL_THE_DELIMITERS.tag_value_to_vec().unwrap(), + br#"alpha\,beta\=delta\ gamma"epsilon"#.as_ref(), + ); + } + + #[test] + fn special_characters_are_escaped_in_field_keys() { + assert_utf8_strings_eq( + &ALL_THE_DELIMITERS.field_key_to_vec().unwrap(), + br#"alpha\,beta\=delta\ gamma"epsilon"#.as_ref(), + ); + } + + #[test] + fn special_characters_are_escaped_in_field_values_of_strings() { + assert_utf8_strings_eq( + &FieldValue::from(ALL_THE_DELIMITERS) + .field_value_to_vec() + .unwrap(), + br#""alpha,beta=delta gamma\"epsilon""#.as_ref(), + ); + } + + #[test] + fn field_value_of_bool() { 
+ let e = FieldValue::from(true); + assert_utf8_strings_eq(&e.field_value_to_vec().unwrap(), b"t"); + + let e = FieldValue::from(false); + assert_utf8_strings_eq(&e.field_value_to_vec().unwrap(), b"f"); + } + + #[test] + fn field_value_of_float() { + let e = FieldValue::from(42_f64); + assert_utf8_strings_eq(&e.field_value_to_vec().unwrap(), b"42"); + } + + #[test] + fn field_value_of_signed_integer() { + let e = FieldValue::from(42_i64); + assert_utf8_strings_eq(&e.field_value_to_vec().unwrap(), b"42i"); + } + + #[test] + fn field_value_of_unsigned_integer() { + let e = FieldValue::from(42_u64); + assert_utf8_strings_eq(&e.field_value_to_vec().unwrap(), b"42u"); + } + + #[test] + fn field_value_of_string() { + let e = FieldValue::from("hello"); + assert_utf8_strings_eq(&e.field_value_to_vec().unwrap(), br#""hello""#); + } + + // Clears up the boilerplate of writing to a vector from the tests + macro_rules! test_extension_traits { + ($($ext_name:ident :: $ext_fn_name:ident -> $base_name:ident :: $base_fn_name:ident,)*) => { + $( + trait $ext_name: $base_name { + fn $ext_fn_name(&self) -> io::Result> { + let mut v = Vec::new(); + self.$base_fn_name(&mut v)?; + Ok(v) + } + } + impl $ext_name for T {} + )* + } + } + + test_extension_traits! { + WriteDataPointExt::data_point_to_vec -> WriteDataPoint::write_data_point_to, + WriteMeasurementExt::measurement_to_vec -> WriteMeasurement::write_measurement_to, + WriteTagKeyExt::tag_key_to_vec -> WriteTagKey::write_tag_key_to, + WriteTagValueExt::tag_value_to_vec -> WriteTagValue::write_tag_value_to, + WriteFieldKeyExt::field_key_to_vec -> WriteFieldKey::write_field_key_to, + WriteFieldValueExt::field_value_to_vec -> WriteFieldValue::write_field_value_to, + } +} diff --git a/influxdb2_client/src/models/file.rs b/influxdb2_client/src/models/file.rs new file mode 100644 index 0000000..d0f0ab8 --- /dev/null +++ b/influxdb2_client/src/models/file.rs @@ -0,0 +1,30 @@ +//! 
File + +use serde::{Deserialize, Serialize}; + +/// Represents a source from a single file +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct File { + /// Type of AST node + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// The name of the file. + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + /// PackageClause + #[serde(skip_serializing_if = "Option::is_none")] + pub package: Option, + /// A list of package imports + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub imports: Vec, + /// List of Flux statements + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub body: Vec, +} + +impl File { + /// Represents a source from a single file + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/health.rs b/influxdb2_client/src/models/health.rs new file mode 100644 index 0000000..b9f46eb --- /dev/null +++ b/influxdb2_client/src/models/health.rs @@ -0,0 +1,49 @@ +//! 
Health + +use serde::{Deserialize, Serialize}; + +/// HealthCheck +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct HealthCheck { + /// Name of the influxdb instance + pub name: String, + /// Message + #[serde(skip_serializing_if = "Option::is_none")] + pub message: Option, + /// Checks + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub checks: Vec, + /// Status + pub status: Status, + /// Version + #[serde(skip_serializing_if = "Option::is_none")] + pub version: Option, + /// Commit + #[serde(skip_serializing_if = "Option::is_none")] + pub commit: Option, +} + +impl HealthCheck { + /// Returns instance of HealthCheck + pub fn new(name: String, status: Status) -> Self { + Self { + name, + status, + message: None, + checks: Vec::new(), + version: None, + commit: None, + } + } +} + +/// Status +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Status { + /// Pass + Pass, + /// Fail + Fail, +} diff --git a/influxdb2_client/src/models/label.rs b/influxdb2_client/src/models/label.rs new file mode 100644 index 0000000..de11ae4 --- /dev/null +++ b/influxdb2_client/src/models/label.rs @@ -0,0 +1,111 @@ +//! Labels + +use serde::{Deserialize, Serialize}; + +/// Post create label request, to create a new label +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct LabelCreateRequest { + /// Organisation ID + #[serde(rename = "orgID")] + pub org_id: String, + /// Label name + pub name: String, + /// Key/Value pairs associated with this label. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub properties: Option<::std::collections::HashMap>, +} + +impl LabelCreateRequest { + /// Return instance of LabelCreateRequest + pub fn new(org_id: String, name: String) -> Self { + Self { + org_id, + name, + ..Default::default() + } + } +} + +/// LabelResponse +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct LabelResponse { + /// Label + #[serde(skip_serializing_if = "Option::is_none")] + pub label: Option, + /// Links + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option, +} + +impl LabelResponse { + /// Returns instance of LabelResponse + pub fn new() -> Self { + Self::default() + } +} + +///LabelsResponse +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct LabelsResponse { + /// Labels + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub labels: Vec, + /// Links + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option, +} + +impl LabelsResponse { + /// Returns List of Labels + pub fn new() -> Self { + Self::default() + } +} + +///LabelUpdateRequest +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct LabelUpdate { + /// Name + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + /// Key/Value pairs associated with this label. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub properties: Option<::std::collections::HashMap>, +} + +impl LabelUpdate { + /// Returns an instance of LabelUpdate + pub fn new() -> Self { + Self::default() + } +} + +/// Label +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Label { + /// Label ID + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + /// Org ID + #[serde(rename = "orgID", skip_serializing_if = "Option::is_none")] + pub org_id: Option, + /// Label name + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + /// Key/Value pairs associated with this label. Keys can be removed by + /// sending an update with an empty value. + #[serde(skip_serializing_if = "Option::is_none")] + pub properties: Option<::std::collections::HashMap>, +} + +impl Label { + /// Returns an instance of Label + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/links.rs b/influxdb2_client/src/models/links.rs new file mode 100644 index 0000000..6bf5871 --- /dev/null +++ b/influxdb2_client/src/models/links.rs @@ -0,0 +1,28 @@ +//! Links + +use serde::{Deserialize, Serialize}; + +/// Links +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Links { + /// Next link + #[serde(skip_serializing_if = "Option::is_none")] + pub next: Option, + /// Link to self + #[serde(rename = "self")] + pub self_: String, + /// Previous Link + #[serde(skip_serializing_if = "Option::is_none")] + pub prev: Option, +} + +impl Links { + /// Returns list of Links + pub fn new(self_: String) -> Self { + Self { + self_, + ..Default::default() + } + } +} diff --git a/influxdb2_client/src/models/mod.rs b/influxdb2_client/src/models/mod.rs new file mode 100644 index 0000000..8c5b4bc --- /dev/null +++ b/influxdb2_client/src/models/mod.rs @@ -0,0 +1,37 @@ +//! InfluxDB Models +//! +//! 
Roughly follows the OpenAPI specification + +pub mod ast; + +pub mod user; +pub use self::user::{User, UserLinks, Users, UsersLinks}; +pub mod organization; +pub use self::organization::{Organization, OrganizationLinks, Organizations}; +pub mod bucket; +pub use self::bucket::{Bucket, BucketLinks, Buckets, PostBucketRequest}; +pub mod onboarding; +pub use self::onboarding::{IsOnboarding, OnboardingRequest, OnboardingResponse}; +pub mod links; +pub use self::links::Links; +pub mod permission; +pub use self::permission::Permission; +pub mod label; +pub use self::label::{Label, LabelCreateRequest, LabelResponse, LabelUpdate, LabelsResponse}; +pub mod authorization; +pub use self::authorization::{Authorization, AuthorizationAllOfLinks}; +pub mod resource; +pub use self::resource::Resource; +pub mod retention_rule; +pub use self::retention_rule::RetentionRule; +pub mod query; +pub use self::query::{ + AnalyzeQueryResponse, AnalyzeQueryResponseErrors, AstResponse, FluxSuggestion, FluxSuggestions, + LanguageRequest, Query, +}; +pub mod file; +pub use self::file::File; +pub mod health; +pub use self::health::{HealthCheck, Status}; +pub mod data_point; +pub use data_point::{DataPoint, FieldValue, WriteDataPoint}; diff --git a/influxdb2_client/src/models/onboarding.rs b/influxdb2_client/src/models/onboarding.rs new file mode 100644 index 0000000..9c720c9 --- /dev/null +++ b/influxdb2_client/src/models/onboarding.rs @@ -0,0 +1,80 @@ +//! # Onboarding +//! +//! Initial setup of InfluxDB instance + +use serde::{Deserialize, Serialize}; + +/// Check if database has default user, org, bucket created, returns true if +/// not. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct IsOnboarding { + /// True if onboarding has already been completed otherwise false + #[serde(default)] + pub allowed: bool, +} + +impl IsOnboarding { + /// Return instance of IsOnboarding + pub fn new() -> Self { + Self::default() + } +} + +/// Post onboarding request, to setup initial user, org and bucket. +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct OnboardingRequest { + /// Initial username + pub username: String, + /// Initial organization name + pub org: String, + /// Initial bucket name + pub bucket: String, + /// Initial password of user + #[serde(skip_serializing_if = "Option::is_none")] + pub password: Option, + /// Retention period in nanoseconds + #[serde(skip_serializing_if = "Option::is_none")] + pub retention_period_seconds: Option, + /// Retention period *in nanoseconds* for the new bucket. This key's name + /// has been misleading since OSS 2.0 GA, please transition to use + /// `retentionPeriodSeconds` + #[serde(skip_serializing_if = "Option::is_none")] + pub retention_period_hrs: Option, +} + +impl OnboardingRequest { + /// Return instance of OnboardingRequest + pub fn new(username: String, org: String, bucket: String) -> Self { + Self { + username, + org, + bucket, + ..Default::default() + } + } +} + +/// OnboardingResponse +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct OnboardingResponse { + /// User + #[serde(skip_serializing_if = "Option::is_none")] + pub user: Option, + /// Organization + #[serde(skip_serializing_if = "Option::is_none")] + pub org: Option, + /// Bucket + #[serde(skip_serializing_if = "Option::is_none")] + pub bucket: Option, + /// Auth token + #[serde(skip_serializing_if = "Option::is_none")] + pub auth: Option, +} + +impl OnboardingResponse { + /// Return instance of OnboardingResponse + pub fn new() -> Self { + 
Self::default() + } +} diff --git a/influxdb2_client/src/models/organization.rs b/influxdb2_client/src/models/organization.rs new file mode 100644 index 0000000..96c81ff --- /dev/null +++ b/influxdb2_client/src/models/organization.rs @@ -0,0 +1,103 @@ +//! Organization + +use serde::{Deserialize, Serialize}; + +/// Organization Schema +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Organization { + /// Links + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option, + /// Organization ID + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + /// Organization Name + pub name: String, + /// Organization description + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option, + /// Organization created timestamp + #[serde(skip_serializing_if = "Option::is_none")] + pub created_at: Option, + /// Organization updated timestamp + #[serde(skip_serializing_if = "Option::is_none")] + pub updated_at: Option, + /// If inactive the organization is inactive. + #[serde(skip_serializing_if = "Option::is_none")] + pub status: Option, +} + +impl Organization { + /// Returns instance of Organization + pub fn new(name: String) -> Self { + Self { + name, + ..Default::default() + } + } +} + +/// If inactive the organization is inactive. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Status { + /// Organization is active + Active, + /// Organization is inactive + Inactive, +} + +/// Organization Links +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct OrganizationLinks { + /// Link to self + #[serde(rename = "self", skip_serializing_if = "Option::is_none")] + pub self_: Option, + /// Links to members + #[serde(skip_serializing_if = "Option::is_none")] + pub members: Option, + /// Links to owners + #[serde(skip_serializing_if = "Option::is_none")] + pub owners: Option, + /// Links to labels + #[serde(skip_serializing_if = "Option::is_none")] + pub labels: Option, + /// Links to secrets + #[serde(skip_serializing_if = "Option::is_none")] + pub secrets: Option, + /// Links to buckets + #[serde(skip_serializing_if = "Option::is_none")] + pub buckets: Option, + /// Links to tasks + #[serde(skip_serializing_if = "Option::is_none")] + pub tasks: Option, + /// Links to dashboards + #[serde(skip_serializing_if = "Option::is_none")] + pub dashboards: Option, +} + +impl OrganizationLinks { + /// Returns instance of Organization Links + pub fn new() -> Self { + Self::default() + } +} + +/// Organizations +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Organizations { + /// Links + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option, + /// List of organizations + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub orgs: Vec, +} + +impl Organizations { + /// Returns instance of Organizations + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/src/models/permission.rs b/influxdb2_client/src/models/permission.rs new file mode 100644 index 0000000..7791d1b --- /dev/null +++ b/influxdb2_client/src/models/permission.rs @@ -0,0 +1,29 @@ +//! 
Permissions + +use serde::{Deserialize, Serialize}; + +/// Permissions for a resource +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct Permission { + /// Access Type + pub action: Action, + /// Resource object + pub resource: crate::models::Resource, +} + +impl Permission { + /// Return instance of Permission + pub fn new(action: Action, resource: crate::models::Resource) -> Self { + Self { action, resource } + } +} + +/// Allowed Permission Action +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Action { + /// Read access + Read, + /// Write access + Write, +} diff --git a/influxdb2_client/src/models/query.rs b/influxdb2_client/src/models/query.rs new file mode 100644 index 0000000..cc5c7a2 --- /dev/null +++ b/influxdb2_client/src/models/query.rs @@ -0,0 +1,161 @@ +//! Query + +use crate::models::ast::Package; +use crate::models::File; +use serde::{Deserialize, Serialize}; +use serde_json::Number; +use std::collections::HashMap; + +/// Query influx using the Flux language +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Query { + /// Query Script + #[serde(rename = "extern", skip_serializing_if = "Option::is_none")] + pub r#extern: Option, + /// Query script to execute. + pub query: String, + /// The type of query. Must be \"flux\". + #[serde(rename = "type", skip_serializing_if = "Option::is_none")] + pub r#type: Option, + /// Dialect + #[serde(skip_serializing_if = "Option::is_none")] + pub dialect: Option, + /// Specifies the time that should be reported as "now" in the query. + /// Default is the server's now time. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub now: Option, + + /// Params for use in query via params.param_name + #[serde(skip_serializing_if = "Option::is_none")] + pub params: Option>, +} + +/// Query Param Enum for Flux +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +#[serde(untagged)] +pub enum Param { + /// A number param + Number(Number), + /// A string param + String(String), +} + +impl Query { + /// Query influx using the Flux language + pub fn new(query: String) -> Self { + Self { + query, + ..Default::default() + } + } +} + +/// The type of query. Must be \"flux\". +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Type { + /// Query Type + Flux, +} + +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +/// Flux Query Suggestion +pub struct FluxSuggestion { + /// Suggestion Name + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + /// Suggestion Params + #[serde(skip_serializing_if = "Option::is_none")] + pub params: Option>, +} + +impl FluxSuggestion { + /// Returns an instance FluxSuggestion + pub fn new() -> Self { + Self::default() + } +} + +/// FluxSuggestions +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct FluxSuggestions { + /// List of Flux Suggestions + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub funcs: Vec, +} + +impl FluxSuggestions { + /// Return an instance of FluxSuggestions + pub fn new() -> Self { + Self::default() + } +} + +/// AnalyzeQueryResponse +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct AnalyzeQueryResponse { + /// List of QueryResponseErrors + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub errors: Vec, +} + +impl AnalyzeQueryResponse { + /// Return an instance of AnanlyzeQueryResponse + pub fn new() -> Self { + Self::default() + 
} +} + +/// AnalyzeQueryResponseErrors +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct AnalyzeQueryResponseErrors { + /// Error line + #[serde(skip_serializing_if = "Option::is_none")] + pub line: Option, + /// Error column + #[serde(skip_serializing_if = "Option::is_none")] + pub column: Option, + /// Error char + #[serde(skip_serializing_if = "Option::is_none")] + pub character: Option, + /// Error message + #[serde(skip_serializing_if = "Option::is_none")] + pub message: Option, +} + +impl AnalyzeQueryResponseErrors { + /// Return an instance of AnalyzeQueryResponseErrors + pub fn new() -> Self { + Self::default() + } +} + +/// AstResponse : Contains the AST for the supplied Flux query +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct AstResponse { + /// AST of Flux query + #[serde(skip_serializing_if = "Option::is_none")] + pub ast: Option, +} + +impl AstResponse { + /// Contains the AST for the supplied Flux query + pub fn new() -> Self { + Self::default() + } +} + +/// LanguageRequest : Flux query to be analyzed. +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct LanguageRequest { + /// Flux query script to be analyzed + pub query: String, +} + +impl LanguageRequest { + /// Flux query to be analyzed. + pub fn new(query: String) -> Self { + Self { query } + } +} diff --git a/influxdb2_client/src/models/resource.rs b/influxdb2_client/src/models/resource.rs new file mode 100644 index 0000000..1b1e0e6 --- /dev/null +++ b/influxdb2_client/src/models/resource.rs @@ -0,0 +1,82 @@ +//! Resources + +use serde::{Deserialize, Serialize}; + +/// Construct a resource +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Resource { + /// Resource Type + #[serde(rename = "type")] + pub r#type: Type, + /// If ID is set that is a permission for a specific resource. 
if it is not + /// set it is a permission for all resources of that resource type. + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + /// Optional name of the resource if the resource has a name field. + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + /// If orgID is set that is a permission for all resources owned my that + /// org. if it is not set it is a permission for all resources of that + /// resource type. + #[serde(rename = "orgID", skip_serializing_if = "Option::is_none")] + pub org_id: Option, + /// Optional name of the organization of the organization with orgID. + #[serde(skip_serializing_if = "Option::is_none")] + pub org: Option, +} + +impl Resource { + /// Returns instance of Resource + pub fn new(r#type: Type) -> Self { + Self { + r#type, + id: None, + name: None, + org_id: None, + org: None, + } + } +} + +/// Resource Type +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Type { + /// Authorizations + Authorizations, + /// Buckets + Buckets, + /// Dashboards + Dashboards, + /// Organizations + Orgs, + /// Sources + Sources, + /// Tasks + Tasks, + /// Telegrafs + Telegrafs, + /// Users + Users, + /// Variables + Variables, + /// Scrapers + Scrapers, + /// Secrets + Secrets, + /// Labels + Labels, + /// Views + Views, + /// Documents + Documents, + /// Notification Rules + NotificationRules, + /// Notification Endpoints + NotificationEndpoints, + /// Checks + Checks, + /// DBRP + Dbrp, +} diff --git a/influxdb2_client/src/models/retention_rule.rs b/influxdb2_client/src/models/retention_rule.rs new file mode 100644 index 0000000..ee78609 --- /dev/null +++ b/influxdb2_client/src/models/retention_rule.rs @@ -0,0 +1,37 @@ +//! 
Retention Rules + +use serde::{Deserialize, Serialize}; + +/// RetentionRule +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct RetentionRule { + /// Expiry + #[serde(rename = "type")] + pub r#type: Type, + /// Duration in seconds for how long data will be kept in the database. 0 + /// means infinite. + pub every_seconds: i32, + /// Shard duration measured in seconds. + #[serde(skip_serializing_if = "Option::is_none")] + pub shard_group_duration_seconds: Option, +} + +impl RetentionRule { + /// Returns instance of RetentionRule + pub fn new(r#type: Type, every_seconds: i32) -> Self { + Self { + r#type, + every_seconds, + shard_group_duration_seconds: None, + } + } +} + +/// Set Retention Rule expired or not +#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Type { + /// RetentionRule Expired + Expire, +} diff --git a/influxdb2_client/src/models/user.rs b/influxdb2_client/src/models/user.rs new file mode 100644 index 0000000..f0d8074 --- /dev/null +++ b/influxdb2_client/src/models/user.rs @@ -0,0 +1,90 @@ +//! Users + +use serde::{Deserialize, Serialize}; + +/// User Schema +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct User { + /// User ID + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + /// User oauth token id + #[serde(rename = "oauthID", skip_serializing_if = "Option::is_none")] + pub oauth_id: Option, + /// User name + pub name: String, + /// If inactive the user is inactive. + #[serde(skip_serializing_if = "Option::is_none")] + pub status: Option, + /// User links + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option, +} + +impl User { + /// Returns instance of user + pub fn new(name: String) -> Self { + Self { + name, + ..Default::default() + } + } +} + +/// If inactive the user is inactive. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum Status { + /// User is active + Active, + /// User is inactive + Inactive, +} + +/// User links +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct UserLinks { + /// User link to Self + #[serde(rename = "self", skip_serializing_if = "Option::is_none")] + pub self_: Option, +} + +impl UserLinks { + /// Returns instance of UserLinks + pub fn new() -> Self { + Self::default() + } +} + +/// List of Users +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct Users { + /// List of user links + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option, + /// List of users + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub users: Vec, +} + +impl Users { + /// Returns instance of Users + pub fn new() -> Self { + Self::default() + } +} + +/// UsersLinks +#[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] +pub struct UsersLinks { + /// Users Link to Self + #[serde(rename = "self", skip_serializing_if = "Option::is_none")] + pub self_: Option, +} + +impl UsersLinks { + /// Returns instance of UsersLinks + pub fn new() -> Self { + Self::default() + } +} diff --git a/influxdb2_client/tests/common/mod.rs b/influxdb2_client/tests/common/mod.rs new file mode 100644 index 0000000..a057eb4 --- /dev/null +++ b/influxdb2_client/tests/common/mod.rs @@ -0,0 +1 @@ +pub mod server_fixture; diff --git a/influxdb2_client/tests/common/server_fixture.rs b/influxdb2_client/tests/common/server_fixture.rs new file mode 100644 index 0000000..e8707f2 --- /dev/null +++ b/influxdb2_client/tests/common/server_fixture.rs @@ -0,0 +1,358 @@ +use once_cell::sync::OnceCell; +use std::{ + fs::File, + process::{Child, Command, Stdio}, + sync::{ + atomic::{AtomicUsize, Ordering::SeqCst}, + Arc, Weak, + }, + time::Duration, +}; +use tokio::sync::Mutex; + 
+#[macro_export] +/// If `TEST_INTEGRATION` is set and InfluxDB 2.0 OSS is available (either locally +/// via `influxd` directly if the `INFLUXDB_IOX_INTEGRATION_LOCAL` environment +// variable is set, or via `docker` otherwise), set up the server as requested and +/// return it to the caller. +/// +/// If `TEST_INTEGRATION` is not set, skip the calling test by returning early. +macro_rules! maybe_skip_integration { + ($server_fixture:expr) => {{ + let local = std::env::var("INFLUXDB_IOX_INTEGRATION_LOCAL").is_ok(); + let command = if local { "influxd" } else { "docker" }; + + match ( + std::process::Command::new("which") + .arg(command) + .stdout(std::process::Stdio::null()) + .status() + .expect("should be able to run `which`") + .success(), + std::env::var("TEST_INTEGRATION").is_ok(), + ) { + (true, true) => $server_fixture, + (false, true) => { + panic!( + "TEST_INTEGRATION is set which requires running integration tests, but \ + `{}` is not available", + command + ) + } + _ => { + eprintln!( + "skipping integration test - set the TEST_INTEGRATION environment variable \ + and install `{}` to run", + command + ); + return Ok(()); + } + } + }}; +} + +/// Represents a server that has been started and is available for +/// testing. +#[derive(Debug)] +pub struct ServerFixture { + server: Arc, +} + +impl ServerFixture { + /// Create a new server fixture and wait for it to be ready. This + /// is called "create" rather than new because it is async and + /// waits. The shared database can be used immediately. + /// + /// This is currently implemented as a singleton so all tests *must* + /// use a new database and not interfere with the existing database. 
+ pub async fn create_shared() -> Self { + // Try and reuse the same shared server, if there is already + // one present + static SHARED_SERVER: OnceCell>> = OnceCell::new(); + + let shared_server = SHARED_SERVER.get_or_init(|| parking_lot::Mutex::new(Weak::new())); + + let shared_upgraded = { + let locked = shared_server.lock(); + locked.upgrade() + }; + + // is a shared server already present? + let server = match shared_upgraded { + Some(server) => server, + None => { + // if not, create one + let mut server = TestServer::new(); + // ensure the server is ready + server.wait_until_ready(InitialConfig::Onboarded).await; + + let server = Arc::new(server); + // save a reference for other threads that may want to + // use this server, but don't prevent it from being + // destroyed when going out of scope + let mut shared_server = shared_server.lock(); + *shared_server = Arc::downgrade(&server); + server + } + }; + + Self { server } + } + + /// Create a new server fixture and wait for it to be ready. This + /// is called "create" rather than new because it is async and + /// waits. The database is left unconfigured and is not shared + /// with any other tests. 
+ pub async fn create_single_use() -> Self { + let mut server = TestServer::new(); + + // ensure the server is ready + server.wait_until_ready(InitialConfig::None).await; + + let server = Arc::new(server); + + Self { server } + } + + /// Return a client suitable for communicating with this server + pub fn client(&self) -> influxdb2_client::Client { + match self.server.admin_token.as_ref() { + Some(token) => influxdb2_client::Client::new(self.http_base(), token), + None => influxdb2_client::Client::new(self.http_base(), ""), + } + } + + /// Return the http base URL for the HTTP API + pub fn http_base(&self) -> &str { + &self.server.http_base + } +} + +/// Specifies whether the server should be set up initially +#[derive(Debug, Copy, Clone, PartialEq)] +enum InitialConfig { + /// Don't set up the server, the test will (for testing onboarding) + None, + /// Onboard the server and set up the client with the associated token (for + /// most tests) + Onboarded, +} + +// These port numbers are chosen to not collide with a development ioxd/influxd +// server running locally. +// TODO(786): allocate random free ports instead of hardcoding. +// TODO(785): we cannot use localhost here. +static NEXT_PORT: AtomicUsize = AtomicUsize::new(8190); + +/// Represents the current known state of a TestServer +#[derive(Debug)] +enum ServerState { + Started, + Ready, + Error, +} + +const ADMIN_TEST_USER: &str = "admin-test-user"; +const ADMIN_TEST_ORG: &str = "admin-test-org"; +const ADMIN_TEST_BUCKET: &str = "admin-test-bucket"; +const ADMIN_TEST_PASSWORD: &str = "admin-test-password"; + +#[derive(Debug)] +struct TestServer { + /// Is the server ready to accept connections? 
+ ready: Mutex, + /// Handle to the server process being controlled + server_process: Child, + /// When using Docker, the name of the detached child + docker_name: Option, + /// HTTP API base + http_base: String, + /// Admin token, if onboarding has happened + admin_token: Option, +} + +impl TestServer { + fn new() -> Self { + let ready = Mutex::new(ServerState::Started); + let http_port = NEXT_PORT.fetch_add(1, SeqCst); + let http_base = format!("http://127.0.0.1:{http_port}"); + + let temp_dir = test_helpers::tmp_dir().unwrap(); + + let mut log_path = temp_dir.path().to_path_buf(); + log_path.push(format!("influxdb_server_fixture_{http_port}.log")); + + let mut bolt_path = temp_dir.path().to_path_buf(); + bolt_path.push(format!("influxd_{http_port}.bolt")); + + let mut engine_path = temp_dir.path().to_path_buf(); + engine_path.push(format!("influxd_{http_port}_engine")); + + println!("****************"); + println!("Server Logging to {log_path:?}"); + println!("****************"); + let log_file = File::create(log_path).expect("Opening log file"); + + let stdout_log_file = log_file + .try_clone() + .expect("cloning file handle for stdout"); + let stderr_log_file = log_file; + + let local = std::env::var("INFLUXDB_IOX_INTEGRATION_LOCAL").is_ok(); + + let (server_process, docker_name) = if local { + let cmd = Command::new("influxd") + .arg("--http-bind-address") + .arg(format!(":{http_port}")) + .arg("--bolt-path") + .arg(bolt_path) + .arg("--engine-path") + .arg(engine_path) + // redirect output to log file + .stdout(stdout_log_file) + .stderr(stderr_log_file) + .spawn() + .expect("starting of local server process"); + (cmd, None) + } else { + let ci_image = "quay.io/influxdb/rust:ci"; + let container_name = format!("influxdb2_{http_port}"); + + Command::new("docker") + .arg("container") + .arg("run") + .arg("--name") + .arg(&container_name) + .arg("--publish") + .arg(format!("{http_port}:8086")) + .arg("--rm") + .arg("--pull") + .arg("always") + .arg("--detach") 
+ .arg(ci_image) + .arg("influxd") + .output() + .expect("starting of docker server process"); + + let cmd = Command::new("docker") + .arg("logs") + .arg(&container_name) + // redirect output to log file + .stdout(stdout_log_file) + .stderr(stderr_log_file) + .spawn() + .expect("starting of docker logs process"); + + (cmd, Some(container_name)) + }; + + Self { + ready, + server_process, + docker_name, + http_base, + admin_token: None, + } + } + + async fn wait_until_ready(&mut self, initial_config: InitialConfig) { + let mut ready = self.ready.lock().await; + match *ready { + ServerState::Started => {} // first time, need to try and start it + ServerState::Ready => { + return; + } + ServerState::Error => { + panic!("Server was previously found to be in Error, aborting"); + } + } + + let try_http_connect = async { + let client = reqwest::Client::new(); + let url = format!("{}/health", self.http_base); + let mut interval = tokio::time::interval(Duration::from_secs(5)); + loop { + match client.get(&url).send().await { + Ok(resp) => { + println!("Successfully got a response from HTTP: {resp:?}"); + return; + } + Err(e) => { + println!("Waiting for HTTP server to be up: {e}"); + } + } + interval.tick().await; + } + }; + + let capped_check = tokio::time::timeout(Duration::from_secs(100), try_http_connect); + + match capped_check.await { + Ok(_) => { + println!("Successfully started {self}"); + *ready = ServerState::Ready; + } + Err(e) => { + // tell others that this server had some problem + *ready = ServerState::Error; + std::mem::drop(ready); + panic!("Server was not ready in required time: {e}"); + } + } + + // Onboard, if requested. 
+ if initial_config == InitialConfig::Onboarded { + let client = influxdb2_client::Client::new(&self.http_base, ""); + let response = client + .onboarding( + ADMIN_TEST_USER, + ADMIN_TEST_ORG, + ADMIN_TEST_BUCKET, + Some(ADMIN_TEST_PASSWORD.to_string()), + Some(0), + None, + ) + .await; + + match response { + Ok(onboarding) => { + let token = onboarding + .auth + .expect("Onboarding should have returned auth info") + .token + .expect("Onboarding auth should have returned a token"); + self.admin_token = Some(token); + } + Err(e) => { + *ready = ServerState::Error; + std::mem::drop(ready); + panic!("Could not onboard: {e}"); + } + } + } + } +} + +impl std::fmt::Display for TestServer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::result::Result<(), std::fmt::Error> { + write!(f, "TestServer (http api: {})", self.http_base) + } +} + +impl Drop for TestServer { + fn drop(&mut self) { + self.server_process + .kill() + .expect("Should have been able to kill the test server"); + + if let Some(docker_name) = &self.docker_name { + Command::new("docker") + .arg("rm") + .arg("--force") + .arg(docker_name) + .stdout(Stdio::null()) + .status() + .expect("killing of docker process"); + } + } +} diff --git a/influxdb2_client/tests/health.rs b/influxdb2_client/tests/health.rs new file mode 100644 index 0000000..293550e --- /dev/null +++ b/influxdb2_client/tests/health.rs @@ -0,0 +1,17 @@ +pub mod common; +use common::server_fixture::ServerFixture; + +type Result> = std::result::Result; + +#[tokio::test] +async fn get_health() -> Result { + // Using a server that has been set up + let server_fixture = maybe_skip_integration!(ServerFixture::create_shared()).await; + let client = server_fixture.client(); + + let res = client.health().await?; + + assert_eq!(res.status, influxdb2_client::models::Status::Pass); + + Ok(()) +} diff --git a/influxdb2_client/tests/setup.rs b/influxdb2_client/tests/setup.rs new file mode 100644 index 0000000..1ba03f9 --- /dev/null +++ 
b/influxdb2_client/tests/setup.rs @@ -0,0 +1,118 @@ +pub mod common; +use common::server_fixture::ServerFixture; + +type Result> = std::result::Result; + +#[tokio::test] +async fn new_server_needs_onboarded() -> Result { + let server_fixture = maybe_skip_integration!(ServerFixture::create_single_use()).await; + let client = server_fixture.client(); + + let res = client.is_onboarding_allowed().await?; + assert!(res); + + // Creating a new setup user without first onboarding is an error + let username = "some-user"; + let org = "some-org"; + let bucket = "some-bucket"; + let password = "some-password"; + let retention_period_hrs = 0; + + let err = client + .post_setup_user( + username, + org, + bucket, + Some(password.to_string()), + Some(retention_period_hrs), + None, + ) + .await + .expect_err("Expected error, got success"); + + assert!(matches!( + err, + influxdb2_client::RequestError::Http { + status: reqwest::StatusCode::UNAUTHORIZED, + .. + } + )); + + Ok(()) +} + +#[tokio::test] +async fn onboarding() -> Result { + let server_fixture = maybe_skip_integration!(ServerFixture::create_single_use()).await; + let client = server_fixture.client(); + + let username = "some-user"; + let org = "some-org"; + let bucket = "some-bucket"; + let password = "some-password"; + let retention_period_hrs = 0; + + client + .onboarding( + username, + org, + bucket, + Some(password.to_string()), + Some(retention_period_hrs), + None, + ) + .await?; + + let res = client.is_onboarding_allowed().await?; + assert!(!res); + + // Onboarding twice is an error + let err = client + .onboarding( + username, + org, + bucket, + Some(password.to_string()), + Some(retention_period_hrs), + None, + ) + .await + .expect_err("Expected error, got success"); + + assert!(matches!( + err, + influxdb2_client::RequestError::Http { + status: reqwest::StatusCode::UNPROCESSABLE_ENTITY, + .. 
+ } + )); + + Ok(()) +} + +#[tokio::test] +async fn create_users() -> Result { + // Using a server that has been set up + let server_fixture = maybe_skip_integration!(ServerFixture::create_shared()).await; + let client = server_fixture.client(); + + let username = "another-user"; + let org = "another-org"; + let bucket = "another-bucket"; + let password = "another-password"; + let retention_period_hrs = 0; + + // Creating a user should work + client + .post_setup_user( + username, + org, + bucket, + Some(password.to_string()), + Some(retention_period_hrs), + None, + ) + .await?; + + Ok(()) +} diff --git a/influxdb_influxql_parser/Cargo.toml b/influxdb_influxql_parser/Cargo.toml new file mode 100644 index 0000000..6751e45 --- /dev/null +++ b/influxdb_influxql_parser/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "influxdb_influxql_parser" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] # In alphabetical order +nom = { version = "7", default-features = false, features = ["std"] } +once_cell = "1" +chrono = { version = "0.4", default-features = false, features = ["std"] } +chrono-tz = { version = "0.8" } +num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-traits = "0.2" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] # In alphabetical order +test_helpers = { path = "../test_helpers" } +assert_matches = "1" +insta = { version = "1.34.0", features = ["yaml"] } +paste = "1.0.14" diff --git a/influxdb_influxql_parser/src/common.rs b/influxdb_influxql_parser/src/common.rs new file mode 100644 index 0000000..e3788c5 --- /dev/null +++ b/influxdb_influxql_parser/src/common.rs @@ -0,0 +1,1014 @@ +//! Type and parsers common to many statements. 
+ +use crate::expression::conditional::{conditional_expression, ConditionalExpression}; +use crate::identifier::{identifier, Identifier}; +use crate::internal::{expect, verify, ParseResult}; +use crate::keywords::{keyword, Token}; +use crate::literal::unsigned_integer; +use crate::string::{regex, Regex}; +use core::fmt; +use nom::branch::alt; +use nom::bytes::complete::{tag, take_till, take_until}; +use nom::character::complete::{char, multispace1}; +use nom::combinator::{map, opt, recognize, value}; +use nom::multi::{fold_many0, fold_many1, separated_list1}; +use nom::sequence::{delimited, pair, preceded, terminated}; +use std::fmt::{Display, Formatter}; +use std::mem; +use std::ops::{Deref, DerefMut}; + +/// A error returned when parsing an InfluxQL query, expressions. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ParseError { + pub(crate) message: String, + pub(crate) pos: usize, +} + +impl Display for ParseError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{} at pos {}", self.message, self.pos) + } +} + +/// Represents a measurement name as either an identifier or a regular expression. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum MeasurementName { + /// A measurement name expressed as an [`Identifier`]. + Name(Identifier), + + /// A measurement name expressed as a [`Regex`]. + Regex(Regex), +} + +impl Parser for MeasurementName { + /// Parse a measurement name, which may be an identifier or a regular expression. + fn parse(i: &str) -> ParseResult<&str, Self> { + alt(( + map(identifier, MeasurementName::Name), + map(regex, MeasurementName::Regex), + ))(i) + } +} + +impl Display for MeasurementName { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Name(ident) => fmt::Display::fmt(ident, f), + Self::Regex(regex) => fmt::Display::fmt(regex, f), + } + } +} + +/// Represents a fully-qualified, 3-part measurement name. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct QualifiedMeasurementName { + /// An optional database name. + pub database: Option, + + /// An optional retention policy. + pub retention_policy: Option, + + /// The measurement name. + pub name: MeasurementName, +} + +impl Display for QualifiedMeasurementName { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Self { + database: None, + retention_policy: None, + name, + } => write!(f, "{name}"), + Self { + database: Some(db), + retention_policy: None, + name, + } => write!(f, "{db}..{name}"), + Self { + database: None, + retention_policy: Some(rp), + name, + } => write!(f, "{rp}.{name}"), + Self { + database: Some(db), + retention_policy: Some(rp), + name, + } => write!(f, "{db}.{rp}.{name}"), + } + } +} + +/// Match a fully-qualified, 3-part measurement name. +/// +/// ```text +/// qualified_measurement_name ::= measurement_name | +/// ( policy_name "." measurement_name ) | +/// ( db_name "." policy_name? "." measurement_name ) +/// +/// db_name ::= identifier +/// policy_name ::= identifier +/// measurement_name ::= identifier | regex_lit +/// ``` +pub(crate) fn qualified_measurement_name(i: &str) -> ParseResult<&str, QualifiedMeasurementName> { + let (remaining_input, (opt_db_rp, name)) = pair( + opt(alt(( + // database "." retention_policy "." + map( + pair( + terminated(identifier, tag(".")), + terminated(identifier, tag(".")), + ), + |(db, rp)| (Some(db), Some(rp)), + ), + // database ".." + map(terminated(identifier, tag("..")), |db| (Some(db), None)), + // retention_policy "." 
+ map(terminated(identifier, tag(".")), |rp| (None, Some(rp))), + ))), + MeasurementName::parse, + )(i)?; + + // Extract possible `database` and / or `retention_policy` + let (database, retention_policy) = match opt_db_rp { + Some(db_rp) => db_rp, + _ => (None, None), + }; + + Ok(( + remaining_input, + QualifiedMeasurementName { + database, + retention_policy, + name, + }, + )) +} + +/// Parse a SQL-style single-line comment +fn comment_single_line(i: &str) -> ParseResult<&str, &str> { + recognize(pair(tag("--"), take_till(|c| c == '\n' || c == '\r')))(i) +} + +/// Parse a SQL-style inline comment, which can span multiple lines +fn comment_inline(i: &str) -> ParseResult<&str, &str> { + recognize(delimited( + tag("/*"), + expect( + "invalid inline comment, missing closing */", + take_until("*/"), + ), + tag("*/"), + ))(i) +} + +/// Repeats the embedded parser until it fails, discarding the results. +/// +/// This parser is used as a non-allocating version of [`nom::multi::many0`]. +fn many0_<'a, A, F>(mut f: F) -> impl FnMut(&'a str) -> ParseResult<&'a str, ()> +where + F: FnMut(&'a str) -> ParseResult<&'a str, A>, +{ + move |i| fold_many0(&mut f, || (), |_, _| ())(i) +} + +/// Optionally consume all whitespace, single-line or inline comments +pub(crate) fn ws0(i: &str) -> ParseResult<&str, ()> { + many0_(alt((multispace1, comment_single_line, comment_inline)))(i) +} + +/// Runs the embedded parser until it fails, discarding the results. +/// Fails if the embedded parser does not produce at least one result. +/// +/// This parser is used as a non-allocating version of [`nom::multi::many1`]. 
+fn many1_<'a, A, F>(mut f: F) -> impl FnMut(&'a str) -> ParseResult<&'a str, ()> +where + F: FnMut(&'a str) -> ParseResult<&'a str, A>, +{ + move |i| fold_many1(&mut f, || (), |_, _| ())(i) +} + +/// Must consume either whitespace, single-line or inline comments +pub(crate) fn ws1(i: &str) -> ParseResult<&str, ()> { + many1_(alt((multispace1, comment_single_line, comment_inline)))(i) +} + +/// Implements common behaviour for u64 tuple-struct types +#[macro_export] +macro_rules! impl_tuple_clause { + ($NAME:ident, $FOR:ty) => { + impl $NAME { + /// Create a new instance with the specified value. + pub fn new(value: $FOR) -> Self { + Self(value) + } + } + + impl std::ops::DerefMut for $NAME { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } + } + + impl std::ops::Deref for $NAME { + type Target = $FOR; + + fn deref(&self) -> &Self::Target { + &self.0 + } + } + + impl From<$FOR> for $NAME { + fn from(value: $FOR) -> Self { + Self(value) + } + } + }; +} + +/// Represents the value for a `LIMIT` clause. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct LimitClause(pub(crate) u64); + +impl_tuple_clause!(LimitClause, u64); + +impl Display for LimitClause { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "LIMIT {}", self.0) + } +} + +/// Parse a `LIMIT ` clause. +pub(crate) fn limit_clause(i: &str) -> ParseResult<&str, LimitClause> { + preceded( + pair(keyword("LIMIT"), ws1), + expect( + "invalid LIMIT clause, expected unsigned integer", + map(unsigned_integer, LimitClause), + ), + )(i) +} + +/// Represents the value for a `OFFSET` clause. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct OffsetClause(pub(crate) u64); + +impl_tuple_clause!(OffsetClause, u64); + +impl Display for OffsetClause { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "OFFSET {}", self.0) + } +} + +/// Parse an `OFFSET ` clause. 
+pub(crate) fn offset_clause(i: &str) -> ParseResult<&str, OffsetClause> { + preceded( + pair(keyword("OFFSET"), ws1), + expect( + "invalid OFFSET clause, expected unsigned integer", + map(unsigned_integer, OffsetClause), + ), + )(i) +} + +/// Parse a terminator that ends a SQL statement. +pub(crate) fn statement_terminator(i: &str) -> ParseResult<&str, ()> { + value((), char(';'))(i) +} + +/// Represents the `WHERE` clause of a statement. +#[derive(Debug, Clone, PartialEq)] +pub struct WhereClause(pub(crate) ConditionalExpression); + +impl WhereClause { + /// Create an instance of a `WhereClause` using `expr` + pub fn new(expr: ConditionalExpression) -> Self { + Self(expr) + } +} + +impl DerefMut for WhereClause { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Deref for WhereClause { + type Target = ConditionalExpression; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Display for WhereClause { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "WHERE {}", self.0) + } +} + +/// Parse a `WHERE` clause. +pub(crate) fn where_clause(i: &str) -> ParseResult<&str, WhereClause> { + preceded( + pair(keyword("WHERE"), ws0), + map(conditional_expression, WhereClause), + )(i) +} + +/// Represents an InfluxQL `ORDER BY` clause. +#[derive(Default, Debug, Clone, Copy, Eq, PartialEq)] +pub enum OrderByClause { + /// Signals the `ORDER BY` is in ascending order. + #[default] + Ascending, + + /// Signals the `ORDER BY` is in descending order. + Descending, +} + +impl OrderByClause { + /// Return `true` if the order by clause is ascending. + pub fn is_ascending(self) -> bool { + matches!(self, Self::Ascending) + } +} + +impl Display for OrderByClause { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "ORDER BY TIME {}", + match self { + Self::Ascending => "ASC", + Self::Descending => "DESC", + } + ) + } +} + +/// Parse an InfluxQL `ORDER BY` clause. 
+/// +/// An `ORDER BY` in InfluxQL is limited when compared to the equivalent +/// SQL definition. It is defined by the following [EBNF] notation: +/// +/// ```text +/// order_by ::= "ORDER" "BY" (time_order | order) +/// order ::= "ASC | "DESC +/// time_order ::= "TIME" order? +/// ``` +/// +/// Resulting in the following valid strings: +/// +/// ```text +/// ORDER BY ASC +/// ORDER BY DESC +/// ORDER BY time +/// ORDER BY time ASC +/// ORDER BY time DESC +/// ``` +/// +/// [EBNF]: https://www.w3.org/TR/2010/REC-xquery-20101214/#EBNFNotation +pub(crate) fn order_by_clause(i: &str) -> ParseResult<&str, OrderByClause> { + let order = || { + preceded( + ws1, + alt(( + value(OrderByClause::Ascending, keyword("ASC")), + value(OrderByClause::Descending, keyword("DESC")), + )), + ) + }; + + preceded( + // "ORDER" "BY" + pair(keyword("ORDER"), preceded(ws1, keyword("BY"))), + expect( + "invalid ORDER BY, expected ASC, DESC or TIME", + alt(( + // "ASC" | "DESC" + order(), + // "TIME" ( "ASC" | "DESC" )? + map( + preceded( + preceded( + ws1, + verify("invalid ORDER BY, expected TIME column", identifier, |v| { + Token(&v.0) == Token("time") + }), + ), + opt(order()), + ), + Option::<_>::unwrap_or_default, + ), + )), + ), + )(i) +} + +/// Parser is a trait that allows a type to parse itself. +pub trait Parser: Sized { + /// Parse this type from the string `i`. + fn parse(i: &str) -> ParseResult<&str, Self>; +} + +/// `OneOrMore` is a container for representing a minimum of one `T`. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct OneOrMore { + pub(crate) contents: Vec, +} + +#[allow(clippy::len_without_is_empty)] +impl OneOrMore { + /// Construct a new `OneOrMore` with `contents`. + /// + /// **NOTE:** that `new` panics if contents is empty. + pub fn new(contents: Vec) -> Self { + if contents.is_empty() { + panic!("OneOrMore requires elements"); + } + + Self { contents } + } + + /// Returns the first element. 
+ pub fn head(&self) -> &T { + self.contents.first().unwrap() + } + + /// Returns the remaining elements after [Self::head]. + pub fn tail(&self) -> &[T] { + &self.contents[1..] + } + + /// Returns the total number of elements. + /// Note that `len` ≥ 1. + pub fn len(&self) -> usize { + self.contents.len() + } +} + +impl Deref for OneOrMore { + type Target = [T]; + + fn deref(&self) -> &Self::Target { + &self.contents + } +} + +impl DerefMut for OneOrMore { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.contents + } +} + +impl OneOrMore { + /// Parse a list of one or more `T`, separated by commas. + /// + /// Returns an error using `msg` if `separated_list1` fails to parse any elements. + pub(crate) fn separated_list1<'a>( + msg: &'static str, + ) -> impl FnMut(&'a str) -> ParseResult<&'a str, Self> { + move |i: &str| { + map( + expect( + msg, + separated_list1(preceded(ws0, char(',')), preceded(ws0, T::parse)), + ), + Self::new, + )(i) + } + } +} + +/// `ZeroOrMore` is a container for representing zero or more elements of type `T`. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ZeroOrMore { + pub(crate) contents: Vec, +} + +impl ZeroOrMore { + /// Construct a new `ZeroOrMore` with `contents`. + pub fn new(contents: Vec) -> Self { + Self { contents } + } + + /// Returns the first element or `None` if the container is empty. + pub fn head(&self) -> Option<&T> { + self.contents.first() + } + + /// Returns the remaining elements after [Self::head]. + pub fn tail(&self) -> &[T] { + if self.contents.len() < 2 { + &[] + } else { + &self.contents[1..] + } + } + + /// Returns the total number of elements in the container. + pub fn len(&self) -> usize { + self.contents.len() + } + + /// Returns true if the container has no elements. + pub fn is_empty(&self) -> bool { + self.contents.is_empty() + } + + /// Takes the vector out of the receiver, leaving a default vector value in its place. 
+ pub fn take(&mut self) -> Vec { + mem::take(&mut self.contents) + } + + /// Replaces the actual value in the receiver by the value given in parameter, + /// returning the old value if present. + pub fn replace(&mut self, value: Vec) -> Vec { + mem::replace(&mut self.contents, value) + } +} + +impl Deref for ZeroOrMore { + type Target = [T]; + + fn deref(&self) -> &Self::Target { + &self.contents + } +} + +impl DerefMut for ZeroOrMore { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.contents + } +} + +impl ZeroOrMore { + /// Parse a list of one or more `T`, separated by commas. + /// + /// Returns an error using `msg` if `separated_list1` fails to parse any elements. + pub(crate) fn separated_list1<'a>( + msg: &'static str, + ) -> impl FnMut(&'a str) -> ParseResult<&'a str, Self> { + move |i: &str| { + map( + expect( + msg, + separated_list1(preceded(ws0, char(',')), preceded(ws0, T::parse)), + ), + Self::new, + )(i) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{assert_error, assert_expect_error}; + use assert_matches::assert_matches; + use nom::character::complete::alphanumeric1; + + impl From<&str> for MeasurementName { + /// Convert a `str` to [`MeasurementName::Name`]. + fn from(s: &str) -> Self { + Self::Name(Identifier::new(s.into())) + } + } + + impl QualifiedMeasurementName { + /// Constructs a new `MeasurementNameExpression` with the specified `name`. + pub fn new(name: MeasurementName) -> Self { + Self { + database: None, + retention_policy: None, + name, + } + } + + /// Constructs a new `MeasurementNameExpression` with the specified `name` and `database`. + pub fn new_db(name: MeasurementName, database: Identifier) -> Self { + Self { + database: Some(database), + retention_policy: None, + name, + } + } + + /// Constructs a new `MeasurementNameExpression` with the specified `name`, `database` and `retention_policy`. 
+ pub fn new_db_rp( + name: MeasurementName, + database: Identifier, + retention_policy: Identifier, + ) -> Self { + Self { + database: Some(database), + retention_policy: Some(retention_policy), + name, + } + } + } + + #[test] + fn test_qualified_measurement_name() { + use MeasurementName::*; + + let (_, got) = qualified_measurement_name("diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: None, + retention_policy: None, + name: Name("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("/diskio/").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: None, + retention_policy: None, + name: Regex("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf.autogen.diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: Some("autogen".into()), + name: Name("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf.autogen./diskio/").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: Some("autogen".into()), + name: Regex("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf..diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: None, + name: Name("diskio".into()), + } + ); + + let (_, got) = qualified_measurement_name("telegraf../diskio/").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: None, + name: Regex("diskio".into()), + } + ); + + // With whitespace + let (_, got) = qualified_measurement_name("\"telegraf\".. \"diskio\"").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: None, + name: Name("diskio".into()), + } + ); + + let (_, got) = + qualified_measurement_name("telegraf. 
/* a comment */ autogen. diskio").unwrap(); + assert_eq!( + got, + QualifiedMeasurementName { + database: Some("telegraf".into()), + retention_policy: Some("autogen".into()), + name: Name("diskio".into()), + } + ); + + // Whitespace following identifier is not supported + let (rem, got) = qualified_measurement_name("telegraf . autogen. diskio").unwrap(); + assert_eq!(rem, " . autogen. diskio"); + assert_eq!( + got, + QualifiedMeasurementName { + database: None, + retention_policy: None, + name: Name("telegraf".into()), + } + ); + + // Fallible + + // Whitespace preceding regex is not supported + qualified_measurement_name("telegraf.autogen. /diskio/").unwrap_err(); + } + + #[test] + fn test_limit_clause() { + let (_, got) = limit_clause("LIMIT 587").unwrap(); + assert_eq!(*got, 587); + + // case insensitive + let (_, got) = limit_clause("limit 587").unwrap(); + assert_eq!(*got, 587); + + // extra spaces between tokens + let (_, got) = limit_clause("LIMIT 123").unwrap(); + assert_eq!(*got, 123); + + // not digits + assert_expect_error!( + limit_clause("LIMIT from"), + "invalid LIMIT clause, expected unsigned integer" + ); + + // incomplete input + assert_expect_error!( + limit_clause("LIMIT "), + "invalid LIMIT clause, expected unsigned integer" + ); + + // overflow + assert_expect_error!( + limit_clause("LIMIT 34593745733489743985734857394"), + "unable to parse unsigned integer" + ); + } + + #[test] + fn test_offset_clause() { + let (_, got) = offset_clause("OFFSET 587").unwrap(); + assert_eq!(*got, 587); + + // case insensitive + let (_, got) = offset_clause("offset 587").unwrap(); + assert_eq!(*got, 587); + + // extra spaces between tokens + let (_, got) = offset_clause("OFFSET 123").unwrap(); + assert_eq!(*got, 123); + + // not digits + assert_expect_error!( + offset_clause("OFFSET from"), + "invalid OFFSET clause, expected unsigned integer" + ); + + // incomplete input + assert_expect_error!( + offset_clause("OFFSET "), + "invalid OFFSET clause, expected 
unsigned integer" + ); + + // overflow + assert_expect_error!( + offset_clause("OFFSET 34593745733489743985734857394"), + "unable to parse unsigned integer" + ); + } + + #[test] + fn test_order_by() { + use OrderByClause::*; + + let (_, got) = order_by_clause("ORDER by asc").unwrap(); + assert_eq!(got, Ascending); + + let (_, got) = order_by_clause("ORDER by desc").unwrap(); + assert_eq!(got, Descending); + + // "time" as a quoted identifier + let (_, got) = order_by_clause("ORDER by \"time\" asc").unwrap(); + assert_eq!(got, Ascending); + + let (_, got) = order_by_clause("ORDER by time asc").unwrap(); + assert_eq!(got, Ascending); + + let (_, got) = order_by_clause("ORDER by time desc").unwrap(); + assert_eq!(got, Descending); + + // default case is ascending + let (_, got) = order_by_clause("ORDER by time").unwrap(); + assert_eq!(got, Ascending); + + // case insensitive + let (_, got) = order_by_clause("ORDER by \"TIME\"").unwrap(); + assert_eq!(got, Ascending); + + let (_, got) = order_by_clause("ORDER by Time").unwrap(); + assert_eq!(got, Ascending); + + // does not consume remaining input + let (i, got) = order_by_clause("ORDER by time LIMIT 10").unwrap(); + assert_eq!(got, Ascending); + assert_eq!(i, " LIMIT 10"); + + // Fallible cases + + // Must be "time" identifier + assert_expect_error!( + order_by_clause("ORDER by foo"), + "invalid ORDER BY, expected TIME column" + ); + } + + #[test] + fn test_where_clause() { + // Can parse a WHERE clause + where_clause("WHERE foo = 'bar'").unwrap(); + + // Remaining input is not consumed + let (i, _) = where_clause("WHERE foo = 'bar' LIMIT 10").unwrap(); + assert_eq!(i, " LIMIT 10"); + + // Without unnecessary whitespace + where_clause("WHERE(foo = 'bar')").unwrap(); + + let (rem, _) = where_clause("WHERE/* a comment*/foo = 'bar'").unwrap(); + assert_eq!(rem, ""); + + // Fallible cases + where_clause("WHERE foo = LIMIT 10").unwrap_err(); + where_clause("WHERE").unwrap_err(); + } + + #[test] + fn 
test_statement_terminator() { + let (i, _) = statement_terminator(";foo").unwrap(); + assert_eq!(i, "foo"); + + let (i, _) = statement_terminator("; foo").unwrap(); + assert_eq!(i, " foo"); + + // Fallible cases + statement_terminator("foo").unwrap_err(); + } + + impl Parser for String { + fn parse(i: &str) -> ParseResult<&str, Self> { + map(alphanumeric1, &str::to_string)(i) + } + } + + type OneOrMoreString = OneOrMore; + + impl Display for OneOrMoreString { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + Display::fmt(self.head(), f)?; + for arg in self.tail() { + write!(f, ", {arg}")?; + } + Ok(()) + } + } + + #[test] + #[should_panic(expected = "OneOrMore requires elements")] + fn test_one_or_more() { + let (_, got) = OneOrMoreString::separated_list1("Expects one or more")("foo").unwrap(); + assert_eq!(got.len(), 1); + assert_eq!(got.head(), "foo"); + assert_eq!(*got, vec!["foo"]); // deref + assert_eq!(got.to_string(), "foo"); + + let (_, got) = + OneOrMoreString::separated_list1("Expects one or more")("foo , bar,foobar").unwrap(); + assert_eq!(got.len(), 3); + assert_eq!(got.head(), "foo"); + assert_eq!(got.tail(), vec!["bar", "foobar"]); + assert_eq!(*got, vec!["foo", "bar", "foobar"]); // deref + assert_eq!(got.to_string(), "foo, bar, foobar"); + + // Fallible cases + + assert_expect_error!( + OneOrMoreString::separated_list1("Expects one or more")("+"), + "Expects one or more" + ); + + // should panic + OneOrMoreString::new(vec![]); + } + + type ZeroOrMoreString = ZeroOrMore; + + impl Display for ZeroOrMoreString { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if let Some(first) = self.head() { + Display::fmt(first, f)?; + for arg in self.tail() { + write!(f, ", {arg}")?; + } + } + + Ok(()) + } + } + + #[test] + fn test_zero_or_more() { + let (_, got) = ZeroOrMoreString::separated_list1("Expects one or more")("foo").unwrap(); + assert_eq!(got.len(), 1); + assert_eq!(got.head().unwrap(), "foo"); + assert_eq!(*got, 
vec!["foo"]); // deref + assert_eq!(got.to_string(), "foo"); + + let (_, got) = + ZeroOrMoreString::separated_list1("Expects one or more")("foo , bar,foobar").unwrap(); + assert_eq!(got.len(), 3); + assert_eq!(got.head().unwrap(), "foo"); + assert_eq!(got.tail(), vec!["bar", "foobar"]); + assert_eq!(*got, vec!["foo", "bar", "foobar"]); // deref + assert_eq!(got.to_string(), "foo, bar, foobar"); + + // should not panic + let got = ZeroOrMoreString::new(vec![]); + assert!(got.is_empty()); + assert_matches!(got.head(), None); + assert_eq!(got.tail().len(), 0); + + // Fallible cases + + assert_expect_error!( + OneOrMoreString::separated_list1("Expects one or more")("+"), + "Expects one or more" + ); + } + + #[test] + fn test_comment_single_line() { + // Comment to EOF + let (rem, _) = comment_single_line("-- this is a test").unwrap(); + assert_eq!(rem, ""); + + // Comment to EOL + let (rem, _) = comment_single_line("-- this is a test\nmore text").unwrap(); + assert_eq!(rem, "\nmore text"); + + // Empty comments + let (rem, _) = comment_single_line("--").unwrap(); + assert_eq!(rem, ""); + let (rem, _) = comment_single_line("--\nSELECT").unwrap(); + assert_eq!(rem, "\nSELECT"); + } + + #[test] + fn test_comment_inline() { + let (rem, _) = comment_inline("/* this is a test */").unwrap(); + assert_eq!(rem, ""); + + let (rem, _) = comment_inline("/* this is a test*/more text").unwrap(); + assert_eq!(rem, "more text"); + + let (rem, _) = comment_inline("/* this\nis a test*/more text").unwrap(); + assert_eq!(rem, "more text"); + + // Ignores embedded /* + let (rem, _) = comment_inline("/* this /* is a test*/more text").unwrap(); + assert_eq!(rem, "more text"); + + // Fallible cases + + assert_expect_error!( + comment_inline("/* this is a test"), + "invalid inline comment, missing closing */" + ); + } + + #[test] + fn test_ws0() { + let (rem, _) = ws0(" -- this is a comment\n/* and some more*/ \t").unwrap(); + assert_eq!(rem, ""); + + let (rem, _) = ws0(" -- this is a 
comment\n/* and some more*/ \tSELECT").unwrap(); + assert_eq!(rem, "SELECT"); + + // no whitespace + let (rem, _) = ws0("SELECT").unwrap(); + assert_eq!(rem, "SELECT"); + } + + #[test] + fn test_ws1() { + let (rem, _) = ws1(" -- this is a comment\n/* and some more*/ \t").unwrap(); + assert_eq!(rem, ""); + + let (rem, _) = ws1(" -- this is a comment\n/* and some more*/ \tSELECT").unwrap(); + assert_eq!(rem, "SELECT"); + + // Fallible cases + + // Missing whitespace + assert_error!(ws1("SELECT"), Many1); + } +} diff --git a/influxdb_influxql_parser/src/create.rs b/influxdb_influxql_parser/src/create.rs new file mode 100644 index 0000000..6b6ae9c --- /dev/null +++ b/influxdb_influxql_parser/src/create.rs @@ -0,0 +1,236 @@ +//! Types and parsers for the [`CREATE DATABASE`][sql] schema statement. +//! +//! [sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/manage-database/#create-database + +use crate::common::ws1; +use crate::identifier::{identifier, Identifier}; +use crate::internal::{expect, ParseResult}; +use crate::keywords::keyword; +use crate::literal::{duration, unsigned_integer, Duration}; +use crate::statement::Statement; +use nom::branch::alt; +use nom::combinator::{map, opt, peek}; +use nom::sequence::{pair, preceded, tuple}; +use std::fmt::{Display, Formatter}; + +pub(crate) fn create_statement(i: &str) -> ParseResult<&str, Statement> { + preceded( + pair(keyword("CREATE"), ws1), + expect( + "Invalid CREATE statement, expected DATABASE following CREATE", + map(create_database, |s| Statement::CreateDatabase(Box::new(s))), + ), + )(i) +} + +/// Represents a `CREATE DATABASE` statement. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CreateDatabaseStatement { + /// Name of database to be created. + pub name: Identifier, + + /// Duration of retention policy. + pub duration: Option, + + /// Replication factor of retention policy. + pub replication: Option, + + /// Shard duration of retention policy. 
+ pub shard_duration: Option, + + /// Retention policy name. + pub retention_name: Option, +} + +impl CreateDatabaseStatement { + /// Returns true if the "WITH" clause is present. + pub fn has_with_clause(&self) -> bool { + self.duration.is_some() + || self.replication.is_some() + || self.shard_duration.is_some() + || self.retention_name.is_some() + } +} + +impl Display for CreateDatabaseStatement { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "CREATE DATABASE {}", self.name)?; + + if self.has_with_clause() { + f.write_str(" WITH")?; + + if let Some(v) = self.duration { + write!(f, " DURATION {v}")?; + } + + if let Some(v) = self.replication { + write!(f, " REPLICATION {v}")?; + } + + if let Some(v) = self.shard_duration { + write!(f, " SHARD DURATION {v}")?; + } + + if let Some(v) = &self.retention_name { + write!(f, " NAME {v}")?; + } + } + Ok(()) + } +} + +fn create_database(i: &str) -> ParseResult<&str, CreateDatabaseStatement> { + let ( + remaining, + ( + _, // "DATABASE" + name, + opt_with_clause, + ), + ) = tuple(( + keyword("DATABASE"), + identifier, + opt(tuple(( + preceded(ws1, keyword("WITH")), + expect( + "invalid WITH clause, expected \"DURATION\", \"REPLICATION\", \"SHARD\" or \"NAME\"", + peek(preceded( + ws1, + alt(( + keyword("DURATION"), + keyword("REPLICATION"), + keyword("SHARD"), + keyword("NAME"), + )), + )), + ), + opt(preceded( + preceded(ws1, keyword("DURATION")), + expect( + "invalid DURATION clause, expected duration", + preceded(ws1, duration), + ), + )), + opt(preceded( + preceded(ws1, keyword("REPLICATION")), + expect( + "invalid REPLICATION clause, expected unsigned integer", + preceded(ws1, unsigned_integer), + ), + )), + opt(preceded( + pair( + preceded(ws1, keyword("SHARD")), + expect( + "invalid SHARD DURATION clause, expected \"DURATION\"", + preceded(ws1, keyword("DURATION")), + ), + ), + expect( + "invalid SHARD DURATION clause, expected duration", + preceded(ws1, duration), + ), + )), + 
opt(preceded( + preceded(ws1, keyword("NAME")), + expect( + "invalid NAME clause, expected identifier", + identifier, + ), + )), + ))), + ))(i)?; + + let (_, _, duration, replication, shard_duration, retention_name) = + opt_with_clause.unwrap_or(("", "", None, None, None, None)); + + Ok(( + remaining, + CreateDatabaseStatement { + name, + duration, + replication, + shard_duration, + retention_name, + }, + )) +} + +#[cfg(test)] +mod test { + use super::create_database; + use super::create_statement; + use crate::assert_expect_error; + + #[test] + fn test_create_statement() { + create_statement("CREATE DATABASE telegraf").unwrap(); + } + + #[test] + fn test_create_database() { + let (rem, got) = create_database("DATABASE telegraf").unwrap(); + assert_eq!(rem, ""); + assert_eq!(got.name, "telegraf".into()); + + let (rem, got) = create_database("DATABASE telegraf WITH DURATION 5m").unwrap(); + assert_eq!(rem, ""); + assert_eq!(got.name, "telegraf".into()); + assert_eq!(got.duration.unwrap().to_string(), "5m"); + + let (rem, got) = create_database("DATABASE telegraf WITH REPLICATION 10").unwrap(); + assert_eq!(rem, ""); + assert_eq!(got.name, "telegraf".into()); + assert_eq!(got.replication.unwrap(), 10); + + let (rem, got) = create_database("DATABASE telegraf WITH SHARD DURATION 6m").unwrap(); + assert_eq!(rem, ""); + assert_eq!(got.name, "telegraf".into()); + assert_eq!(got.shard_duration.unwrap().to_string(), "6m"); + + let (rem, got) = create_database("DATABASE telegraf WITH NAME \"5 minutes\"").unwrap(); + assert_eq!(rem, ""); + assert_eq!(got.name, "telegraf".into()); + assert_eq!(got.retention_name.unwrap(), "5 minutes".into()); + + let (rem, got) = create_database("DATABASE telegraf WITH DURATION 5m REPLICATION 10 SHARD DURATION 6m NAME \"5 minutes\"").unwrap(); + assert_eq!(rem, ""); + assert_eq!(got.name, "telegraf".into()); + assert_eq!(got.duration.unwrap().to_string(), "5m"); + assert_eq!(got.replication.unwrap(), 10); + 
assert_eq!(got.shard_duration.unwrap().to_string(), "6m"); + assert_eq!(got.retention_name.unwrap(), "5 minutes".into()); + + // Fallible + + assert_expect_error!( + create_database("DATABASE telegraf WITH foo"), + "invalid WITH clause, expected \"DURATION\", \"REPLICATION\", \"SHARD\" or \"NAME\"" + ); + + assert_expect_error!( + create_database("DATABASE telegraf WITH DURATION foo"), + "invalid DURATION clause, expected duration" + ); + + assert_expect_error!( + create_database("DATABASE telegraf WITH REPLICATION foo"), + "invalid REPLICATION clause, expected unsigned integer" + ); + + assert_expect_error!( + create_database("DATABASE telegraf WITH SHARD foo"), + "invalid SHARD DURATION clause, expected \"DURATION\"" + ); + + assert_expect_error!( + create_database("DATABASE telegraf WITH SHARD DURATION foo"), + "invalid SHARD DURATION clause, expected duration" + ); + + assert_expect_error!( + create_database("DATABASE telegraf WITH NAME 5"), + "invalid NAME clause, expected identifier" + ); + } +} diff --git a/influxdb_influxql_parser/src/delete.rs b/influxdb_influxql_parser/src/delete.rs new file mode 100644 index 0000000..7a494b3 --- /dev/null +++ b/influxdb_influxql_parser/src/delete.rs @@ -0,0 +1,107 @@ +//! Types and parsers for the [`DELETE`][sql] statement. +//! +//! [sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/manage-database/#delete-series-with-delete + +use crate::common::{where_clause, ws0, ws1, WhereClause}; +use crate::internal::{expect, ParseResult}; +use crate::keywords::keyword; +use crate::simple_from_clause::{delete_from_clause, DeleteFromClause}; +use nom::branch::alt; +use nom::combinator::{map, opt}; +use nom::sequence::{pair, preceded}; +use std::fmt::{Display, Formatter}; + +/// Represents a `DELETE` statement. +#[derive(Clone, Debug, PartialEq)] +pub enum DeleteStatement { + /// A DELETE with a `FROM` clause specifying one or more measurements + /// and an optional `WHERE` clause to restrict which series are deleted. 
+ FromWhere { + /// Represents the `FROM` clause. + from: DeleteFromClause, + + /// Represents the optional `WHERE` clause. + condition: Option, + }, + + /// A `DELETE` with a `WHERE` clause to restrict which series are deleted. + Where(WhereClause), +} + +impl Display for DeleteStatement { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DELETE")?; + + match self { + Self::FromWhere { from, condition } => { + write!(f, " {from}")?; + if let Some(where_clause) = condition { + write!(f, " {where_clause}")?; + } + } + Self::Where(where_clause) => write!(f, " {where_clause}")?, + }; + + Ok(()) + } +} + +/// Parse a `DELETE` statement. +pub(crate) fn delete_statement(i: &str) -> ParseResult<&str, DeleteStatement> { + // delete ::= "DELETE" ( from_clause where_clause? | where_clause ) + preceded( + keyword("DELETE"), + expect( + "invalid DELETE statement, expected FROM or WHERE", + preceded( + ws1, + alt(( + // delete ::= from_clause where_clause? + map( + pair(delete_from_clause, opt(preceded(ws0, where_clause))), + |(from, condition)| DeleteStatement::FromWhere { from, condition }, + ), + // delete ::= where_clause + map(where_clause, DeleteStatement::Where), + )), + ), + ), + )(i) +} + +#[cfg(test)] +mod test { + use crate::assert_expect_error; + use crate::delete::delete_statement; + + #[test] + fn test_delete() { + // Validate via the Display trait, as we don't need to validate the contents of the + // FROM and / or WHERE clauses, given they are tested in their on modules. 
+ + // Measurement name expressed as an identifier + let (_, got) = delete_statement("DELETE FROM foo").unwrap(); + assert_eq!(got.to_string(), "DELETE FROM foo"); + + // Measurement name expressed as a regular expression + let (_, got) = delete_statement("DELETE FROM /foo/").unwrap(); + assert_eq!(got.to_string(), "DELETE FROM /foo/"); + + let (_, got) = delete_statement("DELETE FROM foo WHERE time > 10").unwrap(); + assert_eq!(got.to_string(), "DELETE FROM foo WHERE time > 10"); + + let (_, got) = delete_statement("DELETE WHERE time > 10").unwrap(); + assert_eq!(got.to_string(), "DELETE WHERE time > 10"); + + // Fallible cases + assert_expect_error!( + delete_statement("DELETE"), + "invalid DELETE statement, expected FROM or WHERE" + ); + + assert_expect_error!( + delete_statement("DELETE FOO"), + "invalid DELETE statement, expected FROM or WHERE" + ); + } +} diff --git a/influxdb_influxql_parser/src/drop.rs b/influxdb_influxql_parser/src/drop.rs new file mode 100644 index 0000000..9080403 --- /dev/null +++ b/influxdb_influxql_parser/src/drop.rs @@ -0,0 +1,78 @@ +//! Types and parsers for the [`DROP MEASUREMENT`][sql] statement. +//! +//! [sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/manage-database/#delete-measurements-with-drop-measurement + +use crate::common::ws1; +use crate::identifier::{identifier, Identifier}; +use crate::internal::{expect, ParseResult}; +use crate::keywords::keyword; +use nom::combinator::map; +use nom::sequence::{pair, preceded}; +use std::fmt::{Display, Formatter}; + +/// Represents a `DROP MEASUREMENT` statement. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DropMeasurementStatement { + /// The name of the measurement to delete. 
+ name: Identifier, +} + +impl Display for DropMeasurementStatement { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DROP MEASUREMENT {}", self.name) + } +} + +pub(crate) fn drop_statement(i: &str) -> ParseResult<&str, DropMeasurementStatement> { + preceded( + pair(keyword("DROP"), ws1), + expect( + "invalid DROP statement, expected MEASUREMENT", + drop_measurement, + ), + )(i) +} + +fn drop_measurement(i: &str) -> ParseResult<&str, DropMeasurementStatement> { + preceded( + keyword("MEASUREMENT"), + map( + expect( + "invalid DROP MEASUREMENT statement, expected identifier", + identifier, + ), + |name| DropMeasurementStatement { name }, + ), + )(i) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::assert_expect_error; + + #[test] + fn test_drop_statement() { + drop_statement("DROP MEASUREMENT foo").unwrap(); + + // Fallible cases + assert_expect_error!( + drop_statement("DROP foo"), + "invalid DROP statement, expected MEASUREMENT" + ); + } + + #[test] + fn test_drop_measurement() { + let (_, got) = drop_measurement("MEASUREMENT \"foo\"").unwrap(); + assert_eq!(got, DropMeasurementStatement { name: "foo".into() }); + // validate Display + assert_eq!(got.to_string(), "DROP MEASUREMENT foo"); + + // Fallible cases + assert_expect_error!( + drop_measurement("MEASUREMENT 'foo'"), + "invalid DROP MEASUREMENT statement, expected identifier" + ); + } +} diff --git a/influxdb_influxql_parser/src/explain.rs b/influxdb_influxql_parser/src/explain.rs new file mode 100644 index 0000000..d96fbd7 --- /dev/null +++ b/influxdb_influxql_parser/src/explain.rs @@ -0,0 +1,291 @@ +//! Types and parsers for the [`EXPLAIN`][sql] statement. +//! +//! 
[sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/spec/#explain + +#![allow(dead_code)] // Temporary + +use crate::common::ws1; +use crate::internal::{expect, ParseResult}; +use crate::keywords::keyword; +use crate::statement::{statement, Statement}; +use nom::branch::alt; +use nom::combinator::{map, opt, value}; +use nom::sequence::{preceded, tuple}; +use std::fmt::{Display, Formatter}; + +/// Represents various options for an `EXPLAIN` statement. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ExplainOption { + /// `EXPLAIN VERBOSE statement` + Verbose, + /// `EXPLAIN ANALYZE statement` + Analyze, + /// `EXPLAIN ANALYZE VERBOSE statement` + AnalyzeVerbose, +} + +impl Display for ExplainOption { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Verbose => f.write_str("VERBOSE"), + Self::Analyze => f.write_str("ANALYZE"), + Self::AnalyzeVerbose => f.write_str("ANALYZE VERBOSE"), + } + } +} + +/// Represents an `EXPLAIN` statement. +/// +/// ```text +/// explain ::= "EXPLAIN" explain_options? select_statement +/// explain_options ::= "VERBOSE" | ( "ANALYZE" "VERBOSE"? ) +/// ``` +#[derive(Debug, Clone, PartialEq)] +pub struct ExplainStatement { + /// Represents any options specified for the `EXPLAIN` statement. + pub options: Option, + + /// Represents the `SELECT` statement to be explained and / or analyzed. + pub statement: Box, +} + +impl Display for ExplainStatement { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str("EXPLAIN ")?; + if let Some(options) = &self.options { + write!(f, "{options} ")?; + } + Display::fmt(&self.statement, f) + } +} + +/// Parse an `EXPLAIN` statement. 
+pub(crate) fn explain_statement(i: &str) -> ParseResult<&str, ExplainStatement> { + map( + tuple(( + keyword("EXPLAIN"), + opt(preceded( + ws1, + alt(( + map( + preceded(keyword("ANALYZE"), opt(preceded(ws1, keyword("VERBOSE")))), + |v| match v { + // If the optional combinator is Some, then it matched VERBOSE + Some(_) => ExplainOption::AnalyzeVerbose, + _ => ExplainOption::Analyze, + }, + ), + value(ExplainOption::Verbose, keyword("VERBOSE")), + )), + )), + ws1, + expect( + "invalid EXPLAIN statement, expected InfluxQL statement", + statement, + ), + )), + |(_, options, _, statement)| ExplainStatement { + options, + statement: Box::new(statement), + }, + )(i) +} + +#[cfg(test)] +mod test { + use crate::assert_expect_error; + use crate::explain::{explain_statement, ExplainOption}; + use assert_matches::assert_matches; + + #[test] + fn test_explain_statement() { + // EXPLAIN SELECT cases + + let (remain, got) = explain_statement("EXPLAIN SELECT val from temp").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!(got.to_string(), "EXPLAIN SELECT val FROM temp"); + + let (remain, got) = explain_statement("EXPLAIN VERBOSE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!(got.to_string(), "EXPLAIN VERBOSE SELECT val FROM temp"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE SELECT val FROM temp"); + + let (remain, got) = + explain_statement("EXPLAIN ANALYZE VERBOSE SELECT val from temp").unwrap(); + assert_eq!(remain, ""); + assert_matches!(&got.options, Some(o) if *o == ExplainOption::AnalyzeVerbose); + assert_eq!( + got.to_string(), + "EXPLAIN ANALYZE VERBOSE SELECT val FROM temp" + ); + + // EXPLAIN SHOW 
MEASUREMENTS cases + let (remain, got) = explain_statement("EXPLAIN SHOW MEASUREMENTS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!(got.to_string(), "EXPLAIN SHOW MEASUREMENTS"); + + let (remain, got) = explain_statement("EXPLAIN VERBOSE SHOW MEASUREMENTS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!(got.to_string(), "EXPLAIN VERBOSE SHOW MEASUREMENTS"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE SHOW MEASUREMENTS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE SHOW MEASUREMENTS"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE VERBOSE SHOW MEASUREMENTS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::AnalyzeVerbose); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE VERBOSE SHOW MEASUREMENTS"); + + // EXPLAIN SHOW TAG KEYS cases + let (remain, got) = explain_statement("EXPLAIN SHOW TAG KEYS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!(got.to_string(), "EXPLAIN SHOW TAG KEYS"); + + let (remain, got) = explain_statement("EXPLAIN VERBOSE SHOW TAG KEYS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!(got.to_string(), "EXPLAIN VERBOSE SHOW TAG KEYS"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE SHOW TAG KEYS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE SHOW 
TAG KEYS"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE VERBOSE SHOW TAG KEYS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::AnalyzeVerbose); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE VERBOSE SHOW TAG KEYS"); + + // EXPLAIN SHOW TAG VALUES cases + let (remain, got) = + explain_statement("EXPLAIN SHOW TAG VALUES WITH KEY = \"Key\"").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!( + got.to_string(), + "EXPLAIN SHOW TAG VALUES WITH KEY = \"Key\"" + ); + + let (remain, got) = + explain_statement("EXPLAIN VERBOSE SHOW TAG VALUES WITH KEY = \"Key\"").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!( + got.to_string(), + "EXPLAIN VERBOSE SHOW TAG VALUES WITH KEY = \"Key\"" + ); + + let (remain, got) = + explain_statement("EXPLAIN ANALYZE SHOW TAG VALUES WITH KEY = \"Key\"").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!( + got.to_string(), + "EXPLAIN ANALYZE SHOW TAG VALUES WITH KEY = \"Key\"" + ); + + let (remain, got) = + explain_statement("EXPLAIN ANALYZE VERBOSE SHOW TAG VALUES WITH KEY = \"Key\"") + .unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::AnalyzeVerbose); + assert_eq!( + got.to_string(), + "EXPLAIN ANALYZE VERBOSE SHOW TAG VALUES WITH KEY = \"Key\"" + ); + + // EXPLAIN SHOW FIELD KEYS cases + let (remain, got) = explain_statement("EXPLAIN SHOW FIELD KEYS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!(got.to_string(), "EXPLAIN SHOW FIELD KEYS"); + + let (remain, got) = 
explain_statement("EXPLAIN VERBOSE SHOW FIELD KEYS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!(got.to_string(), "EXPLAIN VERBOSE SHOW FIELD KEYS"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE SHOW FIELD KEYS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE SHOW FIELD KEYS"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE VERBOSE SHOW FIELD KEYS").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::AnalyzeVerbose); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE VERBOSE SHOW FIELD KEYS"); + + // EXPLAIN SHOW RETENTION POLICIES cases + let (remain, got) = explain_statement("EXPLAIN SHOW RETENTION POLICIES").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!(got.to_string(), "EXPLAIN SHOW RETENTION POLICIES"); + + let (remain, got) = explain_statement("EXPLAIN VERBOSE SHOW RETENTION POLICIES").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!(got.to_string(), "EXPLAIN VERBOSE SHOW RETENTION POLICIES"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE SHOW RETENTION POLICIES").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE SHOW RETENTION POLICIES"); + + let (remain, got) = + explain_statement("EXPLAIN ANALYZE VERBOSE SHOW RETENTION POLICIES").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o 
== ExplainOption::AnalyzeVerbose); + assert_eq!( + got.to_string(), + "EXPLAIN ANALYZE VERBOSE SHOW RETENTION POLICIES" + ); + + // EXPLAIN SHOW DATABASES cases + let (remain, got) = explain_statement("EXPLAIN SHOW DATABASES").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(got.options, None); + assert_eq!(got.to_string(), "EXPLAIN SHOW DATABASES"); + + let (remain, got) = explain_statement("EXPLAIN VERBOSE SHOW DATABASES").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Verbose); + assert_eq!(got.to_string(), "EXPLAIN VERBOSE SHOW DATABASES"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE SHOW DATABASES").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE SHOW DATABASES"); + + let (remain, got) = explain_statement("EXPLAIN ANALYZE VERBOSE SHOW DATABASES").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::AnalyzeVerbose); + assert_eq!(got.to_string(), "EXPLAIN ANALYZE VERBOSE SHOW DATABASES"); + + // NOTE: Nested EXPLAIN is valid; DataFusion will throw a "No Nested EXPLAIN" error later + let (remain, got) = + explain_statement("EXPLAIN ANALYZE EXPLAIN SELECT val from temp").unwrap(); + assert_eq!(remain, ""); // assert that all input was consumed + assert_matches!(&got.options, Some(o) if *o == ExplainOption::Analyze); + assert_eq!( + got.to_string(), + "EXPLAIN ANALYZE EXPLAIN SELECT val FROM temp" + ); + + // surfaces statement-specific errors + assert_expect_error!( + explain_statement("EXPLAIN ANALYZE SELECT cpu FROM 'foo'"), + "invalid FROM clause, expected identifier, regular expression or subquery" + ); + } +} diff --git a/influxdb_influxql_parser/src/expression.rs 
b/influxdb_influxql_parser/src/expression.rs new file mode 100644 index 0000000..ee765cc --- /dev/null +++ b/influxdb_influxql_parser/src/expression.rs @@ -0,0 +1,14 @@ +//! Types and parsers for arithmetic and conditional expressions. + +pub use arithmetic::*; +pub use conditional::*; + +/// Provides arithmetic expression parsing. +pub mod arithmetic; +/// Provides conditional expression parsing. +pub mod conditional; +/// Provides APIs to traverse an expression tree using closures. +pub mod walk; + +#[cfg(test)] +mod test_util; diff --git a/influxdb_influxql_parser/src/expression/arithmetic.rs b/influxdb_influxql_parser/src/expression/arithmetic.rs new file mode 100644 index 0000000..264370f --- /dev/null +++ b/influxdb_influxql_parser/src/expression/arithmetic.rs @@ -0,0 +1,1113 @@ +use crate::common::ws0; +use crate::identifier::unquoted_identifier; +use crate::internal::{expect, Error, ParseError, ParseResult}; +use crate::keywords::keyword; +use crate::literal::{literal_regex, Duration}; +use crate::timestamp::Timestamp; +use crate::{ + identifier::{identifier, Identifier}, + literal::Literal, + parameter::BindParameter, +}; +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::char; +use nom::combinator::{cut, map, opt, value}; +use nom::multi::{many0, separated_list0}; +use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; +use num_traits::cast; +use std::fmt::{Display, Formatter, Write}; +use std::ops::Neg; + +/// Reference to a tag or field key. +#[derive(Clone, Debug, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct VarRef { + /// The name of the tag or field. + pub name: Identifier, + + /// An optional data type selection specified using the `::` operator. + /// + /// When the `::` operator follows an identifier, it instructs InfluxQL to fetch + /// only data of the matching data type. 
+ /// + /// The `::` operator appears after an [`Identifier`] and may be described using + /// the following EBNF: + /// + /// ```text + /// variable_ref ::= identifier ( "::" data_type )? + /// data_type ::= "float" | "integer" | "boolean" | "string" | "tag" | "field" + /// ``` + /// + /// For example: + /// + /// ```text + /// SELECT foo::field, host::tag, usage_idle::integer, idle::boolean FROM cpu + /// ``` + /// + /// Specifies the following: + /// + /// * `foo::field` will return a field of any data type named `foo` + /// * `host::tag` will return a tag named `host` + /// * `usage_idle::integer` will return either a float or integer field named `usage_idle`, + /// and casting it to an `integer` + /// * `idle::boolean` will return a field named `idle` that has a matching data type of + /// `boolean` + pub data_type: Option, +} + +impl Display for VarRef { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let Self { name, data_type } = self; + write!(f, "{name}")?; + if let Some(d) = data_type { + write!(f, "::{d}")?; + } + Ok(()) + } +} + +/// Function call +#[derive(Clone, Debug, PartialEq)] +pub struct Call { + /// Represents the name of the function call. + pub name: String, + + /// Represents the list of arguments to the function call. + pub args: Vec, +} + +impl Display for Call { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let Self { name, args } = self; + write!(f, "{name}(")?; + if !args.is_empty() { + let args = args.as_slice(); + write!(f, "{}", args[0])?; + for arg in &args[1..] { + write!(f, ", {arg}")?; + } + } + write!(f, ")") + } +} + +/// Binary operations, such as `1 + 2`. +#[derive(Clone, Debug, PartialEq)] +pub struct Binary { + /// Represents the left-hand side of the binary expression. + pub lhs: Box, + /// Represents the operator to apply to the binary expression. + pub op: BinaryOperator, + /// Represents the right-hand side of the binary expression. 
+ pub rhs: Box, +} + +impl Display for Binary { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let Self { lhs, op, rhs } = self; + write!(f, "{lhs} {op} {rhs}") + } +} + +/// An InfluxQL arithmetic expression. +#[derive(Clone, Debug, PartialEq)] +pub enum Expr { + /// Reference to a tag or field key. + VarRef(VarRef), + + /// BindParameter identifier + BindParameter(BindParameter), + + /// Literal value such as 'foo', 5 or /^(a|b)$/ + Literal(Literal), + + /// A literal wildcard (`*`) with an optional data type selection. + Wildcard(Option), + + /// A DISTINCT `` expression. + Distinct(Identifier), + + /// Function call + Call(Call), + + /// Binary operations, such as `1 + 2`. + Binary(Binary), + + /// Nested expression, such as (foo = 'bar') or (1) + Nested(Box), +} + +impl From for Expr { + fn from(v: Literal) -> Self { + Self::Literal(v) + } +} + +impl From for Expr { + fn from(v: i64) -> Self { + Self::Literal(v.into()) + } +} + +impl From for Expr { + fn from(v: u64) -> Self { + Self::Literal(v.into()) + } +} + +impl From for Expr { + fn from(v: f64) -> Self { + Self::Literal(v.into()) + } +} + +impl From for Box { + fn from(v: u64) -> Self { + Self::new(v.into()) + } +} + +impl From for Box { + fn from(v: i64) -> Self { + Self::new(v.into()) + } +} + +impl From for Box { + fn from(v: i32) -> Self { + Self::new((v as i64).into()) + } +} + +impl Display for Expr { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::VarRef(v) => write!(f, "{v}"), + Self::BindParameter(v) => write!(f, "{v}"), + Self::Literal(v) => write!(f, "{v}"), + Self::Binary(v) => write!(f, "{v}"), + Self::Nested(v) => write!(f, "({v})"), + Self::Call(v) => write!(f, "{v}"), + Self::Wildcard(Some(v)) => write!(f, "*::{v}"), + Self::Wildcard(None) => f.write_char('*'), + Self::Distinct(v) => write!(f, "DISTINCT {v}"), + } + } +} + +/// Traits to help creating InfluxQL [`Expr`]s containing +/// a [`VarRef`]. 
+pub trait AsVarRefExpr { + /// Creates an InfluxQL [`VarRef`] expression. + fn to_var_ref_expr(&self) -> Expr; +} + +impl AsVarRefExpr for str { + fn to_var_ref_expr(&self) -> Expr { + Expr::VarRef(VarRef { + name: self.into(), + data_type: None, + }) + } +} + +/// Specifies the data type of a wildcard (`*`) when using the `::` operator. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WildcardType { + /// Indicates the wildcard refers to tags only. + Tag, + + /// Indicates the wildcard refers to fields only. + Field, +} + +impl Display for WildcardType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Tag => f.write_str("tag"), + Self::Field => f.write_str("field"), + } + } +} + +/// Represents the primitive data types of a [`Expr::VarRef`] when specified +/// using a [cast operation][cast]. +/// +/// InfluxQL only supports casting between [`Self::Float`] and [`Self::Integer`] types. +/// +/// [cast]: https://docs.influxdata.com/influxdb/v1.8/query_language/explore-data/#cast-operations +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum VarRefDataType { + /// Represents a 64-bit float. + Float, + /// Represents a 64-bit integer. + Integer, + /// Represents a 64-bit unsigned integer. + Unsigned, + /// Represents a UTF-8 string. + String, + /// Represents a boolean. + Boolean, + /// Represents a field. + Field, + /// Represents a tag. + Tag, + /// Represents a timestamp. + Timestamp, +} + +impl VarRefDataType { + /// Returns true if the receiver is a data type that identifies as a field type. + pub fn is_field_type(&self) -> bool { + *self < Self::Tag + } + + /// Returns true if the receiver is a data type that identifies as a tag type. + pub fn is_tag_type(&self) -> bool { + *self == Self::Tag + } + + /// Returns true if the receiver is a numeric type. 
+ pub fn is_numeric_type(&self) -> bool { + *self <= Self::Unsigned + } +} + +impl Display for VarRefDataType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Float => f.write_str("float"), + Self::Integer => f.write_str("integer"), + Self::Unsigned => f.write_str("unsigned"), + Self::String => f.write_str("string"), + Self::Boolean => f.write_str("boolean"), + Self::Tag => f.write_str("tag"), + Self::Field => f.write_str("field"), + Self::Timestamp => f.write_str("timestamp"), + } + } +} + +/// An InfluxQL unary operator. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum UnaryOperator { + /// Represents the unary `+` operator. + Plus, + /// Represents the unary `-` operator. + Minus, +} + +impl Display for UnaryOperator { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Plus => f.write_char('+'), + Self::Minus => f.write_char('-'), + } + } +} + +/// An InfluxQL binary operators. +#[derive(Clone, Debug, Copy, PartialEq, Eq)] +pub enum BinaryOperator { + /// Represents the `+` operator. + Add, + /// Represents the `-` operator. + Sub, + /// Represents the `*` operator. + Mul, + /// Represents the `/` operator. + Div, + /// Represents the `%` or modulus operator. + Mod, + /// Represents the `&` or bitwise-and operator. + BitwiseAnd, + /// Represents the `|` or bitwise-or operator. + BitwiseOr, + /// Represents the `^` or bitwise-xor operator. 
+ BitwiseXor, +} + +impl BinaryOperator { + fn reduce_number(&self, lhs: T, rhs: T) -> T + where + T: num_traits::NumOps, + T: num_traits::identities::Zero, + { + match self { + Self::Add => lhs + rhs, + Self::Sub => lhs - rhs, + Self::Mul => lhs * rhs, + // Divide by zero yields zero per + // https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L5216-L5218 + Self::Div if rhs.is_zero() => T::zero(), + Self::Div => lhs / rhs, + Self::Mod => lhs % rhs, + _ => unreachable!(), + } + } + + /// Return a value by applying the operation defined by the receiver. + pub fn reduce(&self, lhs: T, rhs: T) -> T { + match self { + Self::Add | Self::Sub | Self::Mul | Self::Div | Self::Mod => { + self.reduce_number(lhs, rhs) + } + Self::BitwiseAnd => lhs & rhs, + Self::BitwiseOr => lhs | rhs, + Self::BitwiseXor => lhs ^ rhs, + } + } + + /// Return a value by applying the operation defined by the receiver or [`None`] + /// if the operation is not supported. + pub fn try_reduce(&self, lhs: T, rhs: U) -> Option + where + T: num_traits::Float, + U: num_traits::NumOps, + U: num_traits::NumCast, + { + match self { + Self::Add | Self::Sub | Self::Mul | Self::Div | Self::Mod => Some(self.reduce_number( + lhs, + match cast(rhs) { + Some(v) => v, + None => return None, + }, + )), + _ => None, + } + } +} + +impl Display for BinaryOperator { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Add => f.write_char('+'), + Self::Sub => f.write_char('-'), + Self::Mul => f.write_char('*'), + Self::Div => f.write_char('/'), + Self::Mod => f.write_char('%'), + Self::BitwiseAnd => f.write_char('&'), + Self::BitwiseOr => f.write_char('|'), + Self::BitwiseXor => f.write_char('^'), + } + } +} + +/// Parse a unary expression. 
+fn unary(i: &str) -> ParseResult<&str, Expr> +where + T: ArithmeticParsers, +{ + let (i, op) = preceded( + ws0, + alt(( + value(UnaryOperator::Plus, char('+')), + value(UnaryOperator::Minus, char('-')), + )), + )(i)?; + + let (i, e) = factor::(i)?; + + // Unary minus is expressed by negating existing literals, + // or producing a binary arithmetic expression that multiplies + // Expr `e` by -1 + let e = if op == UnaryOperator::Minus { + match e { + Expr::Literal(Literal::Float(v)) => Expr::Literal(Literal::Float(v.neg())), + Expr::Literal(Literal::Integer(v)) => Expr::Literal(Literal::Integer(v.neg())), + Expr::Literal(Literal::Duration(v)) => Expr::Literal(Literal::Duration((v.0.neg()).into())), + Expr::Literal(Literal::Unsigned(v)) => { + if v == (i64::MAX as u64) + 1 { + // The minimum i64 is parsed as a Literal::Unsigned, as it exceeds + // int64::MAX, so we explicitly handle that case per + // https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2750-L2755 + Expr::Literal(Literal::Integer(i64::MIN)) + } else { + return Err(nom::Err::Failure(Error::from_message( + i, + "constant overflows signed integer", + ))); + } + }, + v @ Expr::VarRef { .. } | v @ Expr::Call { .. } | v @ Expr::Nested(..) | v @ Expr::BindParameter(..) => { + Expr::Binary(Binary { + lhs: Box::new(Expr::Literal(Literal::Integer(-1))), + op: BinaryOperator::Mul, + rhs: Box::new(v), + }) + } + _ => { + return Err(nom::Err::Failure(Error::from_message( + i, + "unexpected unary expression: expected literal integer, float, duration, field, function or parenthesis", + ))) + } + } + } else { + e + }; + + Ok((i, e)) +} + +/// Parse a parenthesis expression. +fn parens(i: &str) -> ParseResult<&str, Expr> +where + T: ArithmeticParsers, +{ + delimited( + preceded(ws0, char('(')), + map(arithmetic::, |e| Expr::Nested(e.into())), + preceded(ws0, char(')')), + )(i) +} + +/// Parse a function call expression. 
+/// +/// The `name` field of the [`Expr::Call`] variant is guaranteed to be in lowercase. +pub(crate) fn call_expression(i: &str) -> ParseResult<&str, Expr> +where + T: ArithmeticParsers, +{ + map( + separated_pair( + // special case to handle `DISTINCT`, which is allowed as an identifier + // in a call expression + map(alt((unquoted_identifier, keyword("DISTINCT"))), |n| { + n.to_ascii_lowercase() + }), + ws0, + delimited( + char('('), + alt(( + // A single regular expression to match 0 or more field keys + map(preceded(ws0, literal_regex), |re| vec![re.into()]), + // A list of Expr, separated by commas + separated_list0(preceded(ws0, char(',')), arithmetic::), + )), + cut(preceded(ws0, char(')'))), + ), + ), + |(name, args)| Expr::Call(Call { name, args }), + )(i) +} + +/// Parse a segmented identifier +/// +/// ```text +/// segmented_identifier ::= identifier | +/// ( identifier "." identifier ) | +/// ( identifier "." identifier? "." identifier ) +/// ``` +fn segmented_identifier(i: &str) -> ParseResult<&str, Identifier> { + let (remaining, (opt_prefix, name)) = pair( + opt(alt(( + // ident2 "." ident1 "." + map( + pair( + terminated(identifier, tag(".")), + terminated(identifier, tag(".")), + ), + |(ident2, ident1)| (Some(ident2), Some(ident1)), + ), + // identifier ".." + map(terminated(identifier, tag("..")), |ident2| { + (Some(ident2), None) + }), + // identifier "." + map(terminated(identifier, tag(".")), |ident1| { + (None, Some(ident1)) + }), + ))), + identifier, + )(i)?; + + Ok(( + remaining, + match opt_prefix { + Some((None, Some(ident1))) => format!("{}.{}", ident1.0, name.0).into(), + Some((Some(ident2), None)) => format!("{}..{}", ident2.0, name.0).into(), + Some((Some(ident2), Some(ident1))) => { + format!("{}.{}.{}", ident2.0, ident1.0, name.0).into() + } + _ => name, + }, + )) +} + +/// Parse a variable reference, which is a segmented identifier followed by an optional cast expression. 
+pub(crate) fn var_ref(i: &str) -> ParseResult<&str, Expr> { + map( + pair( + segmented_identifier, + opt(preceded( + tag("::"), + expect( + "invalid data type for tag or field reference, expected float, integer, unsigned, string, boolean, field, tag", + alt(( + value(VarRefDataType::Float, keyword("FLOAT")), + value(VarRefDataType::Integer, keyword("INTEGER")), + value(VarRefDataType::Unsigned, keyword("UNSIGNED")), + value(VarRefDataType::String, keyword("STRING")), + value(VarRefDataType::Boolean, keyword("BOOLEAN")), + value(VarRefDataType::Tag, keyword("TAG")), + value(VarRefDataType::Field, keyword("FIELD")) + )) + ) + )), + ), + |(name, data_type)| Expr::VarRef(VarRef { name, data_type }), + )(i) +} + +/// Parse precedence priority 1 operators. +/// +/// These are the highest precedence operators, and include parenthesis and the unary operators. +fn factor(i: &str) -> ParseResult<&str, Expr> +where + T: ArithmeticParsers, +{ + alt((unary::, parens::, T::operand))(i) +} + +/// Parse arithmetic, precedence priority 2 operators. +/// +/// This includes the multiplication, division, bitwise and, and modulus operators. +fn term(i: &str) -> ParseResult<&str, Expr> +where + T: ArithmeticParsers, +{ + let (input, left) = factor::(i)?; + let (input, remaining) = many0(tuple(( + preceded( + ws0, + alt(( + value(BinaryOperator::Mul, char('*')), + value(BinaryOperator::Div, char('/')), + value(BinaryOperator::BitwiseAnd, char('&')), + value(BinaryOperator::Mod, char('%')), + )), + ), + factor::, + )))(input)?; + Ok((input, reduce_expr(left, remaining))) +} + +/// Parse an arithmetic expression. +/// +/// This includes the addition, subtraction, bitwise or, and bitwise xor operators. 
+pub(crate) fn arithmetic(i: &str) -> ParseResult<&str, Expr> +where + T: ArithmeticParsers, +{ + let (input, left) = term::(i)?; + let (input, remaining) = many0(tuple(( + preceded( + ws0, + alt(( + value(BinaryOperator::Add, char('+')), + value(BinaryOperator::Sub, char('-')), + value(BinaryOperator::BitwiseOr, char('|')), + value(BinaryOperator::BitwiseXor, char('^')), + )), + ), + cut(term::), + )))(input)?; + Ok((input, reduce_expr(left, remaining))) +} + +/// A trait for customizing arithmetic parsers. +pub(crate) trait ArithmeticParsers { + /// Parse an operand of an arithmetic expression. + fn operand(i: &str) -> ParseResult<&str, Expr>; +} + +/// Folds `expr` and `remainder` into a [Expr::Binary] tree. +fn reduce_expr(expr: Expr, remainder: Vec<(BinaryOperator, Expr)>) -> Expr { + remainder.into_iter().fold(expr, |lhs, val| { + Expr::Binary(Binary { + lhs: lhs.into(), + op: val.0, + rhs: val.1.into(), + }) + }) +} + +/// Trait for converting a type to a [`Expr::Literal`] expression. +pub trait LiteralExpr { + /// Convert the receiver to a literal expression. + fn lit(self) -> Expr; +} + +/// Convert `v` to a literal expression. 
+pub fn lit(v: T) -> Expr { + v.lit() +} + +impl LiteralExpr for Literal { + fn lit(self) -> Expr { + Expr::Literal(self) + } +} + +impl LiteralExpr for Duration { + fn lit(self) -> Expr { + Expr::Literal(Literal::Duration(self)) + } +} + +impl LiteralExpr for bool { + fn lit(self) -> Expr { + Expr::Literal(Literal::Boolean(self)) + } +} + +impl LiteralExpr for i64 { + fn lit(self) -> Expr { + Expr::Literal(Literal::Integer(self)) + } +} + +impl LiteralExpr for f64 { + fn lit(self) -> Expr { + Expr::Literal(Literal::Float(self)) + } +} + +impl LiteralExpr for String { + fn lit(self) -> Expr { + Expr::Literal(Literal::String(self)) + } +} + +impl LiteralExpr for Timestamp { + fn lit(self) -> Expr { + Expr::Literal(Literal::Timestamp(self)) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::literal::literal_no_regex; + use crate::parameter::parameter; + use crate::{assert_expect_error, assert_failure, binary_op, nested, param, var_ref}; + + struct TestParsers; + + impl ArithmeticParsers for TestParsers { + fn operand(i: &str) -> ParseResult<&str, Expr> { + preceded( + ws0, + alt(( + map(literal_no_regex, Expr::Literal), + var_ref, + map(parameter, Expr::BindParameter), + )), + )(i) + } + } + + fn arithmetic_expression(i: &str) -> ParseResult<&str, Expr> { + arithmetic::(i) + } + + #[test] + fn test_arithmetic() { + let (_, got) = arithmetic_expression("5 + 51").unwrap(); + assert_eq!(got, binary_op!(5, Add, 51)); + + let (_, got) = arithmetic_expression("5 + $foo").unwrap(); + assert_eq!(got, binary_op!(5, Add, param!("foo"))); + + // Following two tests validate that operators of higher precedence + // are nested deeper in the AST. 
+ + let (_, got) = arithmetic_expression("5 % -3 | 2").unwrap(); + assert_eq!(got, binary_op!(binary_op!(5, Mod, -3), BitwiseOr, 2)); + + let (_, got) = arithmetic_expression("-3 | 2 % 5").unwrap(); + assert_eq!(got, binary_op!(-3, BitwiseOr, binary_op!(2, Mod, 5))); + + let (_, got) = arithmetic_expression("5 % 2 | -3").unwrap(); + assert_eq!(got, binary_op!(binary_op!(5, Mod, 2), BitwiseOr, -3)); + + let (_, got) = arithmetic_expression("2 | -3 % 5").unwrap(); + assert_eq!(got, binary_op!(2, BitwiseOr, binary_op!(-3, Mod, 5))); + + let (_, got) = arithmetic_expression("5 - -(3 | 2)").unwrap(); + assert_eq!( + got, + binary_op!( + 5, + Sub, + binary_op!(-1, Mul, nested!(binary_op!(3, BitwiseOr, 2))) + ) + ); + + let (_, got) = arithmetic_expression("2 | 5 % 3").unwrap(); + assert_eq!(got, binary_op!(2, BitwiseOr, binary_op!(5, Mod, 3))); + + // Expressions are still valid when unnecessary whitespace is omitted + + let (_, got) = arithmetic_expression("5+51").unwrap(); + assert_eq!(got, binary_op!(5, Add, 51)); + + let (_, got) = arithmetic_expression("5+$foo").unwrap(); + assert_eq!(got, binary_op!(5, Add, param!("foo"))); + + let (_, got) = arithmetic_expression("5- -(3|2)").unwrap(); + assert_eq!( + got, + binary_op!( + 5, + Sub, + binary_op!(-1, Mul, nested!(binary_op!(3, BitwiseOr, 2))) + ) + ); + + // whitespace is not significant between unary operators + let (_, got) = arithmetic_expression("5+-(3|2)").unwrap(); + assert_eq!( + got, + binary_op!( + 5, + Add, + binary_op!(-1, Mul, nested!(binary_op!(3, BitwiseOr, 2))) + ) + ); + + // Test unary max signed + let (_, got) = arithmetic_expression("-9223372036854775808").unwrap(); + assert_eq!(got, Expr::Literal(Literal::Integer(-9223372036854775808))); + + // Fallible cases + + // invalid operator / incomplete expression + assert_failure!(arithmetic_expression("5 || 3")); + assert_failure!(arithmetic_expression("5+--(3|2)")); + // exceeds i64::MIN + 
assert_failure!(arithmetic_expression("-9223372036854775809")); + } + + #[test] + fn test_var_ref() { + let (_, got) = var_ref("foo").unwrap(); + assert_eq!(got, var_ref!("foo")); + + // Whilst this is parsed as a 3-part name, it is treated as a quoted string 🙄 + // VarRefs are parsed as segmented identifiers + // + // * https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2515-L2516 + // + // and then the segments are joined as a single string + // + // * https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2551 + let (rem, got) = var_ref("db.rp.foo").unwrap(); + assert_eq!(got, var_ref!("db.rp.foo")); + assert_eq!(got.to_string(), r#""db.rp.foo""#); + assert_eq!(rem, ""); + + // with cast operators + + let (_, got) = var_ref("foo::float").unwrap(); + assert_eq!(got, var_ref!("foo", Float)); + let (_, got) = var_ref("foo::integer").unwrap(); + assert_eq!(got, var_ref!("foo", Integer)); + let (_, got) = var_ref("foo::unsigned").unwrap(); + assert_eq!(got, var_ref!("foo", Unsigned)); + let (_, got) = var_ref("foo::string").unwrap(); + assert_eq!(got, var_ref!("foo", String)); + let (_, got) = var_ref("foo::boolean").unwrap(); + assert_eq!(got, var_ref!("foo", Boolean)); + let (_, got) = var_ref("foo::field").unwrap(); + assert_eq!(got, var_ref!("foo", Field)); + let (_, got) = var_ref("foo::tag").unwrap(); + assert_eq!(got, var_ref!("foo", Tag)); + + // Fallible cases + + assert_expect_error!(var_ref("foo::invalid"), "invalid data type for tag or field reference, expected float, integer, unsigned, string, boolean, field, tag"); + } + + #[test] + fn test_spacing_and_remaining_input() { + // Validate that the remaining input is returned + let (got, _) = arithmetic_expression("foo - 1 + 2 LIMIT 10").unwrap(); + assert_eq!(got, " LIMIT 10"); + + // Any whitespace preceding the expression is consumed + let (got, _) = arithmetic_expression(" foo - 1 + 2").unwrap(); + assert_eq!(got, ""); 
+ + // Various whitespace separators are supported between tokens + let (got, _) = arithmetic_expression("foo\n | 1 \t + \n \t3").unwrap(); + assert!(got.is_empty()) + } + + #[test] + fn test_segmented_identifier() { + // Unquoted + let (rem, id) = segmented_identifier("part0").unwrap(); + assert_eq!(rem, ""); + assert_eq!(id.to_string(), "part0"); + + // id.id + let (rem, id) = segmented_identifier("part1.part0").unwrap(); + assert_eq!(rem, ""); + assert_eq!(id.to_string(), "\"part1.part0\""); + + // id..id + let (rem, id) = segmented_identifier("part2..part0").unwrap(); + assert_eq!(rem, ""); + assert_eq!(id.to_string(), "\"part2..part0\""); + + // id.id.id + let (rem, id) = segmented_identifier("part2.part1.part0").unwrap(); + assert_eq!(rem, ""); + assert_eq!(id.to_string(), "\"part2.part1.part0\""); + + // "id"."id".id + let (rem, id) = segmented_identifier(r#""part 2"."part 1".part0"#).unwrap(); + assert_eq!(rem, ""); + assert_eq!(id.to_string(), "\"part 2.part 1.part0\""); + + // Only parses 3 segments + let (rem, id) = segmented_identifier("part2.part1.part0.foo").unwrap(); + assert_eq!(rem, ".foo"); + assert_eq!(id.to_string(), "\"part2.part1.part0\""); + + // Quoted + let (rem, id) = segmented_identifier("\"part0\"").unwrap(); + assert_eq!(rem, ""); + assert_eq!(id.to_string(), "part0"); + + // Additional test cases, with compatibility proven via https://go.dev/play/p/k2150CJocVl + + let (rem, id) = segmented_identifier(r#""part" 2"."part 1".part0"#).unwrap(); + assert_eq!(rem, r#" 2"."part 1".part0"#); + assert_eq!(id.to_string(), "part"); + + let (rem, id) = segmented_identifier(r#""part" 2."part 1".part0"#).unwrap(); + assert_eq!(rem, r#" 2."part 1".part0"#); + assert_eq!(id.to_string(), "part"); + + let (rem, id) = segmented_identifier(r#""part "2"."part 1".part0"#).unwrap(); + assert_eq!(rem, r#"2"."part 1".part0"#); + assert_eq!(id.to_string(), r#""part ""#); + + let (rem, id) = segmented_identifier(r#""part ""2"."part 1".part0"#).unwrap(); + 
assert_eq!(rem, r#""2"."part 1".part0"#); + assert_eq!(id.to_string(), r#""part ""#); + } + + #[test] + fn test_display_expr() { + #[track_caller] + fn assert_display_expr(input: &str, expected: &str) { + let (_, e) = arithmetic_expression(input).unwrap(); + assert_eq!(e.to_string(), expected); + } + + assert_display_expr("5 + 51", "5 + 51"); + assert_display_expr("5 + -10", "5 + -10"); + assert_display_expr("-(5 % 6)", "-1 * (5 % 6)"); + + // vary spacing + assert_display_expr("( 5 + 6 ) * -( 7+ 8)", "(5 + 6) * -1 * (7 + 8)"); + + // multiple unary and parenthesis + assert_display_expr("(-(5 + 6) & -+( 7 + 8 ))", "(-1 * (5 + 6) & -1 * (7 + 8))"); + + // unquoted identifier + assert_display_expr("foo + 5", "foo + 5"); + + // identifier, negated + assert_display_expr("-foo + 5", "-1 * foo + 5"); + + // bind parameter identifier + assert_display_expr("foo + $0", "foo + $0"); + + // quoted identifier + assert_display_expr(r#""foo" + 'bar'"#, r#"foo + 'bar'"#); + + // quoted identifier, negated + assert_display_expr(r#"-"foo" + 'bar'"#, r#"-1 * foo + 'bar'"#); + + // quoted identifier with spaces, negated + assert_display_expr(r#"-"foo bar" + 'bar'"#, r#"-1 * "foo bar" + 'bar'"#); + + // Duration + assert_display_expr("6h30m", "6h30m"); + + // Negated + assert_display_expr("- 6h30m", "-6h30m"); + + // Validate other expression types + + assert_eq!(Expr::Wildcard(None).to_string(), "*"); + assert_eq!( + Expr::Wildcard(Some(WildcardType::Field)).to_string(), + "*::field" + ); + assert_eq!(Expr::Distinct("foo".into()).to_string(), "DISTINCT foo"); + + // can't parse literal regular expressions as part of an arithmetic expression + assert_failure!(arithmetic_expression(r#""foo" + /^(no|match)$/"#)); + } + + /// Test call expressions using `ConditionalExpression` + fn call(i: &str) -> ParseResult<&str, Expr> { + call_expression::(i) + } + + #[test] + fn test_call() { + #[track_caller] + fn assert_call(input: &str, expected: &str) { + let (_, ex) = call(input).unwrap(); + 
assert_eq!(ex.to_string(), expected); + } + + // These tests validate a `Call` expression and also it's Display implementation. + // We don't need to validate Expr trees, as we do that in the conditional and arithmetic + // tests. + + // No arguments + assert_call("FN()", "fn()"); + + // Single argument with surrounding whitespace + assert_call("FN ( 1 )", "fn(1)"); + + // Multiple arguments with varying whitespace + assert_call("FN ( 1,2\n,3,\t4 )", "fn(1, 2, 3, 4)"); + + // Arguments as expressions + assert_call("FN ( 1 + 2, foo, 'bar' )", "fn(1 + 2, foo, 'bar')"); + + // A single regular expression argument + assert_call("FN ( /foo/ )", "fn(/foo/)"); + + // Fallible cases + + call("FN ( 1").unwrap_err(); + call("FN ( 1, )").unwrap_err(); + call("FN ( 1,, 2 )").unwrap_err(); + + // Conditionals not supported + call("FN ( 1 = 2 )").unwrap_err(); + + // Multiple regular expressions not supported + call("FN ( /foo/, /bar/ )").unwrap_err(); + } + + #[test] + fn test_var_ref_display() { + assert_eq!( + Expr::VarRef(VarRef { + name: "foo".into(), + data_type: None + }) + .to_string(), + "foo" + ); + assert_eq!( + Expr::VarRef(VarRef { + name: "foo".into(), + data_type: Some(VarRefDataType::Field) + }) + .to_string(), + "foo::field" + ); + } + + #[test] + fn test_var_ref_data_type() { + use VarRefDataType::*; + + // Ensure ordering of data types relative to one another. 
+ + assert!(Float < Integer); + assert!(Integer < Unsigned); + assert!(Unsigned < String); + assert!(String < Boolean); + assert!(Boolean < Field); + assert!(Field < Tag); + + assert!(Float.is_field_type()); + assert!(Integer.is_field_type()); + assert!(Unsigned.is_field_type()); + assert!(String.is_field_type()); + assert!(Boolean.is_field_type()); + assert!(Field.is_field_type()); + assert!(Tag.is_tag_type()); + + assert!(!Float.is_tag_type()); + assert!(!Integer.is_tag_type()); + assert!(!Unsigned.is_tag_type()); + assert!(!String.is_tag_type()); + assert!(!Boolean.is_tag_type()); + assert!(!Field.is_tag_type()); + assert!(!Tag.is_field_type()); + + assert!(Float.is_numeric_type()); + assert!(Integer.is_numeric_type()); + assert!(Unsigned.is_numeric_type()); + assert!(!String.is_numeric_type()); + assert!(!Boolean.is_numeric_type()); + assert!(!Field.is_numeric_type()); + assert!(!Tag.is_numeric_type()); + } + + #[test] + fn test_binary_operator_reduce() { + use BinaryOperator::*; + + // + // Integer, Integer + // + + // Numeric operations + assert_eq!(Add.reduce(10, 2), 12); + assert_eq!(Sub.reduce(10, 2), 8); + assert_eq!(Mul.reduce(10, 2), 20); + assert_eq!(Div.reduce(10, 2), 5); + // Divide by zero yields zero + assert_eq!(Div.reduce(10, 0), 0); + assert_eq!(Mod.reduce(10, 2), 0); + // Bitwise operations + assert_eq!(BitwiseAnd.reduce(0b1111, 0b1010), 0b1010); + assert_eq!(BitwiseOr.reduce(0b0101, 0b1010), 0b1111); + assert_eq!(BitwiseXor.reduce(0b1101, 0b1010), 0b0111); + + // + // Float, Float + // + + assert_eq!(Add.try_reduce(10.0, 2.0).unwrap(), 12.0); + assert_eq!(Sub.try_reduce(10.0, 2.0).unwrap(), 8.0); + assert_eq!(Mul.try_reduce(10.0, 2.0).unwrap(), 20.0); + assert_eq!(Div.try_reduce(10.0, 2.0).unwrap(), 5.0); + // Divide by zero yields zero + assert_eq!(Div.try_reduce(10.0, 0.0).unwrap(), 0.0); + assert_eq!(Mod.try_reduce(10.0, 2.0).unwrap(), 0.0); + + // Bitwise operations + assert!(BitwiseAnd.try_reduce(1.0, 1.0).is_none()); + 
assert!(BitwiseOr.try_reduce(1.0, 1.0).is_none()); + assert!(BitwiseXor.try_reduce(1.0, 1.0).is_none()); + + // + // Float, Integer + // + + assert_eq!(Add.try_reduce(10.0, 2).unwrap(), 12.0); + assert_eq!(Sub.try_reduce(10.0, 2).unwrap(), 8.0); + assert_eq!(Mul.try_reduce(10.0, 2).unwrap(), 20.0); + assert_eq!(Div.try_reduce(10.0, 2).unwrap(), 5.0); + // Divide by zero yields zero + assert_eq!(Div.try_reduce(10.0, 0).unwrap(), 0.0); + assert_eq!(Mod.try_reduce(10.0, 2).unwrap(), 0.0); + } +} diff --git a/influxdb_influxql_parser/src/expression/conditional.rs b/influxdb_influxql_parser/src/expression/conditional.rs new file mode 100644 index 0000000..f34d696 --- /dev/null +++ b/influxdb_influxql_parser/src/expression/conditional.rs @@ -0,0 +1,631 @@ +use crate::common::{ws0, ParseError}; +use crate::expression::arithmetic::{ + arithmetic, call_expression, var_ref, ArithmeticParsers, Expr, +}; +use crate::expression::Call; +use crate::functions::is_scalar_math_function; +use crate::internal::{expect, verify, Error as InternalError, ParseResult}; +use crate::keywords::keyword; +use crate::literal::{literal_no_regex, literal_regex, Literal}; +use crate::parameter::parameter; +use crate::select::is_valid_now_call; +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::char; +use nom::combinator::{map, value}; +use nom::multi::many0; +use nom::sequence::{delimited, preceded, tuple}; +use nom::Offset; +use std::fmt; +use std::fmt::{Display, Formatter, Write}; +use std::str::FromStr; + +/// Represents one of the conditional operators supported by [`ConditionalExpression::Binary`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ConditionalOperator { + /// Represents the `=` operator. + Eq, + /// Represents the `!=` or `<>` operator. + NotEq, + /// Represents the `=~` (regular expression equals) operator. + EqRegex, + /// Represents the `!~` (regular expression not equals) operator. + NotEqRegex, + /// Represents the `<` operator. 
+ Lt, + /// Represents the `<=` operator. + LtEq, + /// Represents the `>` operator. + Gt, + /// Represents the `>=` operator. + GtEq, + /// Represents the `IN` operator. + In, + /// Represents the `AND` operator. + And, + /// Represents the `OR` operator. + Or, +} + +impl Display for ConditionalOperator { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Eq => f.write_char('='), + Self::NotEq => f.write_str("!="), + Self::EqRegex => f.write_str("=~"), + Self::NotEqRegex => f.write_str("!~"), + Self::Lt => f.write_char('<'), + Self::LtEq => f.write_str("<="), + Self::Gt => f.write_char('>'), + Self::GtEq => f.write_str(">="), + Self::In => f.write_str("IN"), + Self::And => f.write_str("AND"), + Self::Or => f.write_str("OR"), + } + } +} + +/// Conditional binary operations, such as `foo = 'bar'` or `true AND false`. +#[derive(Debug, Clone, PartialEq)] +pub struct ConditionalBinary { + /// Represents the left-hand side of the conditional binary expression. + pub lhs: Box, + /// Represents the operator to apply to the conditional binary expression. + pub op: ConditionalOperator, + /// Represents the right-hand side of the conditional binary expression. + pub rhs: Box, +} + +impl Display for ConditionalBinary { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let Self { lhs, op, rhs } = self; + write!(f, "{lhs} {op} {rhs}") + } +} + +/// Represents a conditional expression. +#[derive(Debug, Clone, PartialEq)] +pub enum ConditionalExpression { + /// Represents an arithmetic expression. + Expr(Box), + + /// Binary operations, such as `foo = 'bar'` or `true AND false`. + Binary(ConditionalBinary), + + /// Represents a conditional expression enclosed in parenthesis. + Grouped(Box), +} + +impl ConditionalExpression { + /// Returns the inner arithmetic [`Expr`]. 
+ pub fn expr(&self) -> Option<&Expr> { + if let Self::Expr(expr) = self { + Some(expr) + } else { + None + } + } + + /// Return `self == other` + pub fn eq(self, other: Self) -> Self { + binary_cond(self, ConditionalOperator::Eq, other) + } + + /// Return `self != other` + pub fn not_eq(self, other: Self) -> Self { + binary_cond(self, ConditionalOperator::NotEq, other) + } + + /// Return `self > other` + pub fn gt(self, other: Self) -> Self { + binary_cond(self, ConditionalOperator::Gt, other) + } + + /// Return `self >= other` + pub fn gt_eq(self, other: Self) -> Self { + binary_cond(self, ConditionalOperator::GtEq, other) + } + + /// Return `self < other` + pub fn lt(self, other: Self) -> Self { + binary_cond(self, ConditionalOperator::Lt, other) + } + + /// Return `self <= other` + pub fn lt_eq(self, other: Self) -> Self { + binary_cond(self, ConditionalOperator::LtEq, other) + } + + /// Return `self AND other` + pub fn and(self, other: Self) -> Self { + binary_cond(self, ConditionalOperator::And, other) + } + + /// Return `self OR other` + pub fn or(self, other: Self) -> Self { + binary_cond(self, ConditionalOperator::Or, other) + } +} + +impl Display for ConditionalExpression { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Expr(v) => fmt::Display::fmt(v, f), + Self::Binary(v) => fmt::Display::fmt(v, f), + Self::Grouped(v) => write!(f, "({v})"), + } + } +} + +impl From for ConditionalExpression { + fn from(v: Literal) -> Self { + Self::Expr(Box::new(Expr::Literal(v))) + } +} + +/// Parse a parenthesis expression. 
+fn parens(i: &str) -> ParseResult<&str, ConditionalExpression> { + delimited( + preceded(ws0, char('(')), + map(conditional_expression, |e| { + ConditionalExpression::Grouped(e.into()) + }), + preceded(ws0, char(')')), + )(i) +} + +fn expr_or_group(i: &str) -> ParseResult<&str, ConditionalExpression> { + alt(( + map(arithmetic_expression, |v| { + ConditionalExpression::Expr(Box::new(v)) + }), + parens, + ))(i) +} + +/// Parse the conditional regular expression operators `=~` and `!~`. +fn conditional_regex(i: &str) -> ParseResult<&str, ConditionalExpression> { + let (input, f1) = expr_or_group(i)?; + let (input, exprs) = many0(tuple(( + preceded( + ws0, + alt(( + value(ConditionalOperator::EqRegex, tag("=~")), + value(ConditionalOperator::NotEqRegex, tag("!~")), + )), + ), + map( + expect( + "invalid conditional, expected regular expression", + preceded(ws0, literal_regex), + ), + From::from, + ), + )))(input)?; + Ok((input, reduce_expr(f1, exprs))) +} + +/// Parse conditional operators. +fn conditional(i: &str) -> ParseResult<&str, ConditionalExpression> { + let (input, f1) = conditional_regex(i)?; + let (input, exprs) = many0(tuple(( + preceded( + ws0, + alt(( + // try longest matches first + value(ConditionalOperator::LtEq, tag("<=")), + value(ConditionalOperator::GtEq, tag(">=")), + value(ConditionalOperator::NotEq, tag("!=")), + value(ConditionalOperator::NotEq, tag("<>")), + value(ConditionalOperator::Lt, char('<')), + value(ConditionalOperator::Gt, char('>')), + value(ConditionalOperator::Eq, char('=')), + )), + ), + expect("invalid conditional expression", conditional_regex), + )))(input)?; + Ok((input, reduce_expr(f1, exprs))) +} + +/// Parse conjunction operators, such as `AND`. 
+fn conjunction(i: &str) -> ParseResult<&str, ConditionalExpression> { + let (input, f1) = conditional(i)?; + let (input, exprs) = many0(tuple(( + value(ConditionalOperator::And, preceded(ws0, keyword("AND"))), + expect("invalid conditional expression", conditional), + )))(input)?; + Ok((input, reduce_expr(f1, exprs))) +} + +/// Parse disjunction operator, such as `OR`. +fn disjunction(i: &str) -> ParseResult<&str, ConditionalExpression> { + let (input, f1) = conjunction(i)?; + let (input, exprs) = many0(tuple(( + value(ConditionalOperator::Or, preceded(ws0, keyword("OR"))), + expect("invalid conditional expression", conjunction), + )))(input)?; + Ok((input, reduce_expr(f1, exprs))) +} + +/// Parse an InfluxQL conditional expression. +pub(crate) fn conditional_expression(i: &str) -> ParseResult<&str, ConditionalExpression> { + disjunction(i) +} + +/// Parse the input completely and return a [`ConditionalExpression`]. +/// +/// All leading and trailing whitespace is consumed. If any input remains after parsing, +/// an error is returned. 
+pub fn parse_conditional_expression(input: &str) -> Result { + let mut i: &str = input; + + // Consume whitespace from the input + (i, _) = ws0(i).expect("ws0 is infallible"); + + if i.is_empty() { + return Err(ParseError { + message: "unexpected eof".into(), + pos: 0, + }); + } + + let (mut i, cond) = match conditional_expression(i) { + Ok((i1, cond)) => (i1, cond), + Err(nom::Err::Failure(InternalError::Syntax { + input: pos, + message, + })) => { + return Err(ParseError { + message: message.into(), + pos: input.offset(pos), + }) + } + // any other error indicates an invalid expression + Err(_) => { + return Err(ParseError { + message: "invalid conditional expression".into(), + pos: input.offset(i), + }) + } + }; + + // Consume remaining whitespace from the input + (i, _) = ws0(i).expect("ws0 is infallible"); + + if !i.is_empty() { + return Err(ParseError { + message: "invalid conditional expression".into(), + pos: input.offset(i), + }); + } + + Ok(cond) +} + +impl FromStr for ConditionalExpression { + type Err = ParseError; + + fn from_str(s: &str) -> Result { + parse_conditional_expression(s) + } +} + +/// Folds `expr` and `remainder` into a [ConditionalExpression::Binary] tree. +fn reduce_expr( + expr: ConditionalExpression, + remainder: Vec<(ConditionalOperator, ConditionalExpression)>, +) -> ConditionalExpression { + remainder.into_iter().fold(expr, |lhs, val| { + ConditionalExpression::Binary(ConditionalBinary { + lhs: lhs.into(), + op: val.0, + rhs: val.1.into(), + }) + }) +} + +/// Returns true if `expr` is a valid [`Expr::Call`] expression for condtional expressions +/// in the WHERE clause. +pub(crate) fn is_valid_conditional_call(expr: &Expr) -> bool { + is_valid_now_call(expr) + || match expr { + Expr::Call(Call { name, .. 
}) => is_scalar_math_function(name), + _ => false, + } +} + +impl ConditionalExpression { + /// Parse the `now()` function call + fn call(i: &str) -> ParseResult<&str, Expr> { + verify( + "invalid expression, the only valid function calls are 'now' with no arguments, or scalar math functions", + call_expression::, + is_valid_conditional_call, + )(i) + } +} + +impl ArithmeticParsers for ConditionalExpression { + fn operand(i: &str) -> ParseResult<&str, Expr> { + preceded( + ws0, + alt(( + map(literal_no_regex, Expr::Literal), + Self::call, + var_ref, + map(parameter, Expr::BindParameter), + )), + )(i) + } +} + +/// Parse an arithmetic expression used by conditional expressions. +pub(crate) fn arithmetic_expression(i: &str) -> ParseResult<&str, Expr> { + arithmetic::(i) +} + +/// Return a new conditional expression, `lhs op rhs`. +pub fn binary_cond( + lhs: ConditionalExpression, + op: ConditionalOperator, + rhs: ConditionalExpression, +) -> ConditionalExpression { + ConditionalExpression::Binary(ConditionalBinary { + lhs: Box::new(lhs), + op, + rhs: Box::new(rhs), + }) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::expression::arithmetic::Expr; + use crate::{ + assert_expect_error, assert_failure, binary_op, call, cond_op, grouped, regex, var_ref, + }; + use test_helpers::assert_error; + + impl From for ConditionalExpression { + fn from(v: Expr) -> Self { + Self::Expr(Box::new(v)) + } + } + + impl From for Box { + fn from(v: i32) -> Self { + Self::new(ConditionalExpression::Expr(Box::new(Expr::Literal( + (v as i64).into(), + )))) + } + } + + impl From for Box { + fn from(v: i64) -> Self { + Self::new(ConditionalExpression::Expr(Box::new(Expr::Literal( + v.into(), + )))) + } + } + + impl From for Box { + fn from(v: u64) -> Self { + Self::new(ConditionalExpression::Expr(Box::new(Expr::Literal( + v.into(), + )))) + } + } + + impl From for Box { + fn from(v: Expr) -> Self { + Self::new(ConditionalExpression::Expr(v.into())) + } + } + + impl From> for Box { 
+ fn from(v: Box) -> Self { + Self::new(ConditionalExpression::Expr(v)) + } + } + + #[test] + fn test_arithmetic_expression() { + // now() function call is permitted + let (_, got) = arithmetic_expression("now() + 3").unwrap(); + assert_eq!(got, binary_op!(call!("now"), Add, 3)); + + // arithmetic functions calls are permitted + let (_, got) = arithmetic_expression("abs(f) + 3").unwrap(); + assert_eq!(got, binary_op!(call!("abs", var_ref!("f")), Add, 3)); + + // Fallible cases + + assert_expect_error!( + arithmetic_expression("sum(foo)"), + "invalid expression, the only valid function calls are 'now' with no arguments, or scalar math functions" + ); + + assert_expect_error!( + arithmetic_expression("now(1)"), + "invalid expression, the only valid function calls are 'now' with no arguments, or scalar math functions" + ); + } + + #[test] + fn test_conditional_expression() { + let (_, got) = conditional_expression("foo = 5").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), Eq, 5)); + + let (_, got) = conditional_expression("foo != 5").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), NotEq, 5)); + + let (_, got) = conditional_expression("foo > 5").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), Gt, 5)); + + let (_, got) = conditional_expression("foo >= 5").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), GtEq, 5)); + + let (_, got) = conditional_expression("foo < 5").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), Lt, 5)); + + let (_, got) = conditional_expression("foo <= 5").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), LtEq, 5)); + + let (_, got) = conditional_expression("foo > 5 + 6 ").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), Gt, binary_op!(5, Add, 6))); + + let (_, got) = conditional_expression("5 <= -6").unwrap(); + assert_eq!(got, *cond_op!(5, LtEq, -6)); + + // simple expressions + let (_, got) = conditional_expression("true").unwrap(); + assert_eq!( + got, + 
ConditionalExpression::Expr(Box::new(Expr::Literal(true.into()))) + ); + + // Expressions are still valid when whitespace is omitted + + let (_, got) = conditional_expression("foo>5+6 ").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), Gt, binary_op!(5, Add, 6))); + + let (_, got) = conditional_expression("5<=-6").unwrap(); + assert_eq!(got, *cond_op!(5, LtEq, -6)); + + // var refs with cast operator + let (_, got) = conditional_expression("foo::integer = 5").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo", Integer), Eq, 5)); + + // Fallible cases + + // conditional expression must be complete + assert_failure!(conditional_expression("5 <=")); + + // should not accept a regex literal + assert_failure!(conditional_expression("5 = /regex/")); + } + + #[test] + fn test_logical_expression() { + let (_, got) = conditional_expression("5 AND 6").unwrap(); + assert_eq!(got, *cond_op!(5, And, 6)); + + let (_, got) = conditional_expression("5 AND 6 OR 7").unwrap(); + assert_eq!(got, *cond_op!(cond_op!(5, And, 6), Or, 7)); + + let (_, got) = conditional_expression("5 > 3 OR 6 = 7 AND 7 != 1").unwrap(); + assert_eq!( + got, + *cond_op!( + cond_op!(5, Gt, 3), + Or, + cond_op!(cond_op!(6, Eq, 7), And, cond_op!(7, NotEq, 1)) + ) + ); + + let (_, got) = conditional_expression("5 AND (6 OR 7)").unwrap(); + assert_eq!(got, *cond_op!(5, And, grouped!(cond_op!(6, Or, 7)))); + + // <> is recognised as != + let (_, got) = conditional_expression("5 <> 6").unwrap(); + assert_eq!(got, *cond_op!(5, NotEq, 6)); + + // In the following cases, we validate that the `OR` keyword is not eagerly + // parsed from substrings + let (got, _) = conditional_expression("foo = bar ORDER BY time ASC").unwrap(); + assert_eq!(got, " ORDER BY time ASC"); + + let (got, _) = conditional_expression("foo = bar OR1").unwrap(); + assert_eq!(got, " OR1"); + + // Whitespace is optional for certain characters + let (got, _) = conditional_expression("foo = bar OR(foo > bar) ORDER BY time ASC").unwrap(); + 
assert_eq!(got, " ORDER BY time ASC"); + + // Fallible cases + + // Expects Expr after operator + assert_failure!(conditional_expression("5 OR -")); + assert_failure!(conditional_expression("5 OR")); + assert_failure!(conditional_expression("5 AND")); + + // Can't use "and" as identifier + assert_failure!(conditional_expression("5 AND and OR 5")); + } + + #[test] + fn test_regex() { + let (_, got) = conditional_expression("foo =~ /(a > b)/").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), EqRegex, regex!("(a > b)"))); + + let (_, got) = conditional_expression("foo !~ /bar/").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), NotEqRegex, regex!("bar"))); + + // Expressions are still valid when whitespace is omitted + + let (_, got) = conditional_expression("foo=~/(a > b)/").unwrap(); + assert_eq!(got, *cond_op!(var_ref!("foo"), EqRegex, regex!("(a > b)"))); + + // Fallible cases + + // Expects a regex literal after regex conditional operators + assert_expect_error!( + conditional_expression("foo =~ 5"), + "invalid conditional, expected regular expression" + ); + assert_expect_error!( + conditional_expression("foo !~ 5"), + "invalid conditional, expected regular expression" + ); + } + + #[test] + fn test_display_expr() { + let (_, e) = conditional_expression("foo = 'test'").unwrap(); + assert_eq!(e.to_string(), "foo = 'test'"); + } + + #[test] + fn test_parse_conditional_expression() { + assert_eq!( + parse_conditional_expression("a>b").unwrap().to_string(), + "a > b" + ); + + // with leading and trailing whitespace + assert_eq!( + parse_conditional_expression(" a>b ").unwrap().to_string(), + "a > b" + ); + + // Fallible cases + + // Expected regular expression + assert_error!(parse_conditional_expression("a =~ 'foo'"), ref e @ ParseError { .. } if e.pos == 4); + + // Invalid operator + assert_error!(parse_conditional_expression("a ~= /foo/"), ref e @ ParseError { .. 
} if e.pos == 2); + } + + /// Validate the [`FromStr`] implementation for [`ConditionalExpression`]. + #[test] + fn test_conditional_expression_parse() { + let cond = " a>b ".parse::().unwrap(); + assert_eq!(cond.to_string(), "a > b"); + } + + #[test] + fn test_conditional_expression_expr() { + let cond: ConditionalExpression = "a + 1 > b - 2".parse().unwrap(); + assert!(cond.expr().is_none()); + + let cond: ConditionalExpression = "(a + 1 > b - 2)".parse().unwrap(); + assert!(cond.expr().is_none()); + + let cond: ConditionalExpression = "a + 1".parse().unwrap(); + assert_eq!(cond.expr().unwrap().to_string(), "a + 1"); + + let cond: ConditionalExpression = "(a + 1)".parse().unwrap(); + assert_eq!(cond.expr().unwrap().to_string(), "(a + 1)"); + } +} diff --git a/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr-2.snap b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr-2.snap new file mode 100644 index 0000000..625a695 --- /dev/null +++ b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr-2.snap @@ -0,0 +1,7 @@ +--- +source: influxdb_influxql_parser/src/expression/walk.rs +expression: "walk_expr(\"now() + 1h\")" +--- +0: Call(Call { name: "now", args: [] }) +1: Literal(Duration(Duration(3600000000000))) +2: Binary(Binary { lhs: Call(Call { name: "now", args: [] }), op: Add, rhs: Literal(Duration(Duration(3600000000000))) }) diff --git a/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr.snap b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr.snap new file mode 100644 index 0000000..219abf1 --- /dev/null +++ b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr.snap @@ -0,0 +1,7 @@ +--- +source: 
influxdb_influxql_parser/src/expression/walk.rs +expression: "walk_expr(\"5 + 6\")" +--- +0: Literal(Integer(5)) +1: Literal(Integer(6)) +2: Binary(Binary { lhs: Literal(Integer(5)), op: Add, rhs: Literal(Integer(6)) }) diff --git a/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr_mut-2.snap b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr_mut-2.snap new file mode 100644 index 0000000..27cd9cc --- /dev/null +++ b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr_mut-2.snap @@ -0,0 +1,7 @@ +--- +source: influxdb_influxql_parser/src/expression/walk.rs +expression: "walk_expr_mut(\"now() + 1h\")" +--- +0: Call(Call { name: "now", args: [] }) +1: Literal(Duration(Duration(3600000000000))) +2: Binary(Binary { lhs: Call(Call { name: "now", args: [] }), op: Add, rhs: Literal(Duration(Duration(3600000000000))) }) diff --git a/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr_mut.snap b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr_mut.snap new file mode 100644 index 0000000..6eb590b --- /dev/null +++ b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expr_mut.snap @@ -0,0 +1,7 @@ +--- +source: influxdb_influxql_parser/src/expression/walk.rs +expression: "walk_expr_mut(\"5 + 6\")" +--- +0: Literal(Integer(5)) +1: Literal(Integer(6)) +2: Binary(Binary { lhs: Literal(Integer(5)), op: Add, rhs: Literal(Integer(6)) }) diff --git a/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expression-2.snap b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expression-2.snap new file mode 100644 index 0000000..6ff8ee4 --- 
/dev/null +++ b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expression-2.snap @@ -0,0 +1,11 @@ +--- +source: influxdb_influxql_parser/src/expression/walk.rs +expression: "walk_expression(\"time > now() + 1h\")" +--- +0: Arithmetic(VarRef(VarRef { name: Identifier("time"), data_type: None })) +1: Conditional(Expr(VarRef(VarRef { name: Identifier("time"), data_type: None }))) +2: Arithmetic(Call(Call { name: "now", args: [] })) +3: Arithmetic(Literal(Duration(Duration(3600000000000)))) +4: Arithmetic(Binary(Binary { lhs: Call(Call { name: "now", args: [] }), op: Add, rhs: Literal(Duration(Duration(3600000000000))) })) +5: Conditional(Expr(Binary(Binary { lhs: Call(Call { name: "now", args: [] }), op: Add, rhs: Literal(Duration(Duration(3600000000000))) }))) +6: Conditional(Binary(ConditionalBinary { lhs: Expr(VarRef(VarRef { name: Identifier("time"), data_type: None })), op: Gt, rhs: Expr(Binary(Binary { lhs: Call(Call { name: "now", args: [] }), op: Add, rhs: Literal(Duration(Duration(3600000000000))) })) })) diff --git a/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expression.snap b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expression.snap new file mode 100644 index 0000000..aae7068 --- /dev/null +++ b/influxdb_influxql_parser/src/expression/snapshots/influxdb_influxql_parser__expression__walk__test__walk_expression.snap @@ -0,0 +1,13 @@ +--- +source: influxdb_influxql_parser/src/expression/walk.rs +expression: "walk_expression(\"5 + 6 = 2 + 9\")" +--- +0: Arithmetic(Literal(Integer(5))) +1: Arithmetic(Literal(Integer(6))) +2: Arithmetic(Binary(Binary { lhs: Literal(Integer(5)), op: Add, rhs: Literal(Integer(6)) })) +3: Conditional(Expr(Binary(Binary { lhs: Literal(Integer(5)), op: Add, rhs: Literal(Integer(6)) }))) +4: Arithmetic(Literal(Integer(2))) +5: 
Arithmetic(Literal(Integer(9))) +6: Arithmetic(Binary(Binary { lhs: Literal(Integer(2)), op: Add, rhs: Literal(Integer(9)) })) +7: Conditional(Expr(Binary(Binary { lhs: Literal(Integer(2)), op: Add, rhs: Literal(Integer(9)) }))) +8: Conditional(Binary(ConditionalBinary { lhs: Expr(Binary(Binary { lhs: Literal(Integer(5)), op: Add, rhs: Literal(Integer(6)) })), op: Eq, rhs: Expr(Binary(Binary { lhs: Literal(Integer(2)), op: Add, rhs: Literal(Integer(9)) })) })) diff --git a/influxdb_influxql_parser/src/expression/test_util.rs b/influxdb_influxql_parser/src/expression/test_util.rs new file mode 100644 index 0000000..2d18de4 --- /dev/null +++ b/influxdb_influxql_parser/src/expression/test_util.rs @@ -0,0 +1,130 @@ +#![cfg(test)] + +/// Constructs an [crate::expression::arithmetic::Expr::VarRef] expression. +#[macro_export] +macro_rules! var_ref { + ($NAME: literal) => { + $crate::expression::Expr::VarRef($crate::expression::VarRef { + name: $NAME.into(), + data_type: None, + }) + }; + + ($NAME: literal, $TYPE: ident) => { + $crate::expression::Expr::VarRef($crate::expression::VarRef { + name: $NAME.into(), + data_type: Some($crate::expression::arithmetic::VarRefDataType::$TYPE), + }) + }; +} + +/// Constructs a regular expression [crate::expression::arithmetic::Expr::Literal]. +#[macro_export] +macro_rules! regex { + ($EXPR: expr) => { + $crate::expression::arithmetic::Expr::Literal( + $crate::literal::Literal::Regex($EXPR.into()).into(), + ) + }; +} + +/// Constructs a [crate::expression::arithmetic::Expr::BindParameter] expression. +#[macro_export] +macro_rules! param { + ($EXPR: expr) => { + $crate::expression::arithmetic::Expr::BindParameter( + $crate::parameter::BindParameter::new($EXPR.into()).into(), + ) + }; +} + +/// Constructs a [crate::expression::conditional::ConditionalExpression::Grouped] expression. +#[macro_export] +macro_rules! 
grouped { + ($EXPR: expr) => { + <$crate::expression::conditional::ConditionalExpression as std::convert::Into< + Box<$crate::expression::conditional::ConditionalExpression>, + >>::into($crate::expression::conditional::ConditionalExpression::Grouped($EXPR.into())) + }; +} + +/// Constructs a [crate::expression::arithmetic::Expr::Nested] expression. +#[macro_export] +macro_rules! nested { + ($EXPR: expr) => { + <$crate::expression::arithmetic::Expr as std::convert::Into< + Box<$crate::expression::arithmetic::Expr>, + >>::into($crate::expression::arithmetic::Expr::Nested($EXPR.into())) + }; +} + +/// Constructs a [crate::expression::arithmetic::Expr::Call] expression. +#[macro_export] +macro_rules! call { + ($NAME:literal) => { + $crate::expression::Expr::Call($crate::expression::Call { + name: $NAME.into(), + args: vec![], + }) + }; + ($NAME:literal, $( $ARG:expr ),+) => { + $crate::expression::Expr::Call($crate::expression::Call { + name: $NAME.into(), + args: vec![$( $ARG ),+], + }) + }; +} + +/// Constructs a [crate::expression::arithmetic::Expr::Distinct] expression. +#[macro_export] +macro_rules! distinct { + ($IDENT:literal) => { + $crate::expression::arithmetic::Expr::Distinct($IDENT.into()) + }; +} + +/// Constructs a [crate::expression::arithmetic::Expr::Wildcard] expression. +#[macro_export] +macro_rules! wildcard { + () => { + $crate::expression::arithmetic::Expr::Wildcard(None) + }; + (tag) => { + $crate::expression::arithmetic::Expr::Wildcard(Some( + $crate::expression::arithmetic::WildcardType::Tag, + )) + }; + (field) => { + $crate::expression::arithmetic::Expr::Wildcard(Some( + $crate::expression::arithmetic::WildcardType::Field, + )) + }; +} + +/// Constructs a [crate::expression::arithmetic::Expr::Binary] expression. +#[macro_export] +macro_rules! 
binary_op { + ($LHS: expr, $OP: ident, $RHS: expr) => { + $crate::expression::Expr::Binary($crate::expression::Binary { + lhs: $LHS.into(), + op: $crate::expression::BinaryOperator::$OP, + rhs: $RHS.into(), + }) + }; +} + +/// Constructs a [crate::expression::conditional::ConditionalExpression::Binary] expression. +#[macro_export] +macro_rules! cond_op { + ($LHS: expr, $OP: ident, $RHS: expr) => { + <$crate::expression::ConditionalExpression as std::convert::Into< + Box<$crate::expression::ConditionalExpression>, + >>::into($crate::expression::ConditionalExpression::Binary( + $crate::expression::ConditionalBinary { + lhs: $LHS.into(), + op: $crate::expression::ConditionalOperator::$OP, + rhs: $RHS.into(), + }, + )) + }; +} diff --git a/influxdb_influxql_parser/src/expression/walk.rs b/influxdb_influxql_parser/src/expression/walk.rs new file mode 100644 index 0000000..352633d --- /dev/null +++ b/influxdb_influxql_parser/src/expression/walk.rs @@ -0,0 +1,205 @@ +use crate::expression::{Binary, Call, ConditionalBinary, ConditionalExpression, Expr}; + +/// Expression distinguishes InfluxQL [`ConditionalExpression`] or [`Expr`] +/// nodes when visiting a [`ConditionalExpression`] tree. See [`walk_expression`]. +#[derive(Debug)] +pub enum Expression<'a> { + /// Specifies a conditional expression. + Conditional(&'a ConditionalExpression), + /// Specifies an arithmetic expression. + Arithmetic(&'a Expr), +} + +/// ExpressionMut is the same as [`Expression`] with the exception that +/// it provides mutable access to the nodes of the tree. +#[derive(Debug)] +pub enum ExpressionMut<'a> { + /// Specifies a conditional expression. + Conditional(&'a mut ConditionalExpression), + /// Specifies an arithmetic expression. + Arithmetic(&'a mut Expr), +} + +/// Perform a depth-first traversal of an expression tree. 
+pub fn walk_expression<'a, B>( + node: &'a ConditionalExpression, + visit: &mut impl FnMut(Expression<'a>) -> std::ops::ControlFlow, +) -> std::ops::ControlFlow { + match node { + ConditionalExpression::Expr(n) => walk_expr(n, &mut |n| visit(Expression::Arithmetic(n)))?, + ConditionalExpression::Binary(ConditionalBinary { lhs, rhs, .. }) => { + walk_expression(lhs, visit)?; + walk_expression(rhs, visit)?; + } + ConditionalExpression::Grouped(n) => walk_expression(n, visit)?, + } + + visit(Expression::Conditional(node)) +} + +/// Perform a depth-first traversal of a mutable arithmetic or conditional expression tree. +pub fn walk_expression_mut( + node: &mut ConditionalExpression, + visit: &mut impl FnMut(ExpressionMut<'_>) -> std::ops::ControlFlow, +) -> std::ops::ControlFlow { + match node { + ConditionalExpression::Expr(n) => { + walk_expr_mut(n, &mut |n| visit(ExpressionMut::Arithmetic(n)))? + } + ConditionalExpression::Binary(ConditionalBinary { lhs, rhs, .. }) => { + walk_expression_mut(lhs, visit)?; + walk_expression_mut(rhs, visit)?; + } + ConditionalExpression::Grouped(n) => walk_expression_mut(n, visit)?, + } + + visit(ExpressionMut::Conditional(node)) +} + +/// Perform a depth-first traversal of the arithmetic expression tree. +pub fn walk_expr<'a, B>( + expr: &'a Expr, + visit: &mut impl FnMut(&'a Expr) -> std::ops::ControlFlow, +) -> std::ops::ControlFlow { + match expr { + Expr::Binary(Binary { lhs, rhs, .. }) => { + walk_expr(lhs, visit)?; + walk_expr(rhs, visit)?; + } + Expr::Nested(n) => walk_expr(n, visit)?, + Expr::Call(Call { args, .. }) => { + args.iter().try_for_each(|n| walk_expr(n, visit))?; + } + Expr::VarRef { .. } + | Expr::BindParameter(_) + | Expr::Literal(_) + | Expr::Wildcard(_) + | Expr::Distinct(_) => {} + } + + visit(expr) +} + +/// Perform a depth-first traversal of a mutable arithmetic expression tree. 
+pub fn walk_expr_mut( + expr: &mut Expr, + visit: &mut impl FnMut(&mut Expr) -> std::ops::ControlFlow, +) -> std::ops::ControlFlow { + match expr { + Expr::Binary(Binary { lhs, rhs, .. }) => { + walk_expr_mut(lhs, visit)?; + walk_expr_mut(rhs, visit)?; + } + Expr::Nested(n) => walk_expr_mut(n, visit)?, + Expr::Call(Call { args, .. }) => { + args.iter_mut().try_for_each(|n| walk_expr_mut(n, visit))?; + } + Expr::VarRef { .. } + | Expr::BindParameter(_) + | Expr::Literal(_) + | Expr::Wildcard(_) + | Expr::Distinct(_) => {} + } + + visit(expr) +} + +#[cfg(test)] +mod test { + use crate::expression::walk::{walk_expr_mut, walk_expression_mut, ExpressionMut}; + use crate::expression::{ + arithmetic_expression, conditional_expression, ConditionalBinary, ConditionalExpression, + ConditionalOperator, Expr, VarRef, + }; + use crate::literal::Literal; + + #[test] + fn test_walk_expression() { + fn walk_expression(s: &str) -> String { + let (_, ref expr) = conditional_expression(s).unwrap(); + let mut calls = Vec::new(); + let mut call_no = 0; + super::walk_expression::<()>(expr, &mut |n| { + calls.push(format!("{call_no}: {n:?}")); + call_no += 1; + std::ops::ControlFlow::Continue(()) + }); + calls.join("\n") + } + + insta::assert_display_snapshot!(walk_expression("5 + 6 = 2 + 9")); + insta::assert_display_snapshot!(walk_expression("time > now() + 1h")); + } + + #[test] + fn test_walk_expression_mut_modify() { + let (_, ref mut expr) = conditional_expression("foo + bar + 5 =~ /str/").unwrap(); + walk_expression_mut::<()>(expr, &mut |e| { + match e { + ExpressionMut::Arithmetic(n) => match n { + Expr::VarRef(VarRef { name, .. }) => *name = format!("c_{name}").into(), + Expr::Literal(Literal::Integer(v)) => *v *= 10, + Expr::Literal(Literal::Regex(v)) => *v = format!("c_{}", v.0).into(), + _ => {} + }, + ExpressionMut::Conditional(n) => { + if let ConditionalExpression::Binary(ConditionalBinary { op, .. 
}) = n { + *op = ConditionalOperator::NotEqRegex + } + } + } + std::ops::ControlFlow::Continue(()) + }); + assert_eq!(expr.to_string(), "c_foo + c_bar + 50 !~ /c_str/") + } + + #[test] + fn test_walk_expr() { + fn walk_expr(s: &str) -> String { + let (_, expr) = arithmetic_expression(s).unwrap(); + let mut calls = Vec::new(); + let mut call_no = 0; + super::walk_expr::<()>(&expr, &mut |n| { + calls.push(format!("{call_no}: {n:?}")); + call_no += 1; + std::ops::ControlFlow::Continue(()) + }); + calls.join("\n") + } + + insta::assert_display_snapshot!(walk_expr("5 + 6")); + insta::assert_display_snapshot!(walk_expr("now() + 1h")); + } + + #[test] + fn test_walk_expr_mut() { + fn walk_expr_mut(s: &str) -> String { + let (_, mut expr) = arithmetic_expression(s).unwrap(); + let mut calls = Vec::new(); + let mut call_no = 0; + super::walk_expr_mut::<()>(&mut expr, &mut |n| { + calls.push(format!("{call_no}: {n:?}")); + call_no += 1; + std::ops::ControlFlow::Continue(()) + }); + calls.join("\n") + } + + insta::assert_display_snapshot!(walk_expr_mut("5 + 6")); + insta::assert_display_snapshot!(walk_expr_mut("now() + 1h")); + } + + #[test] + fn test_walk_expr_mut_modify() { + let (_, mut expr) = arithmetic_expression("foo + bar + 5").unwrap(); + walk_expr_mut::<()>(&mut expr, &mut |e| { + match e { + Expr::VarRef(VarRef { name, .. }) => *name = format!("c_{name}").into(), + Expr::Literal(Literal::Integer(v)) => *v *= 10, + _ => {} + } + std::ops::ControlFlow::Continue(()) + }); + assert_eq!(expr.to_string(), "c_foo + c_bar + 50") + } +} diff --git a/influxdb_influxql_parser/src/functions.rs b/influxdb_influxql_parser/src/functions.rs new file mode 100644 index 0000000..b42103e --- /dev/null +++ b/influxdb_influxql_parser/src/functions.rs @@ -0,0 +1,74 @@ +//! # [Functions] supported by InfluxQL +//! +//! 
[Functions]: https://docs.influxdata.com/influxdb/v1.8/query_language/functions/ + +use std::collections::HashSet; + +use once_cell::sync::Lazy; + +/// Returns `true` if `name` is a mathematical scalar function +/// supported by InfluxQL. +pub fn is_scalar_math_function(name: &str) -> bool { + static FUNCTIONS: Lazy> = Lazy::new(|| { + HashSet::from([ + "abs", "sin", "cos", "tan", "asin", "acos", "atan", "atan2", "exp", "log", "ln", + "log2", "log10", "sqrt", "pow", "floor", "ceil", "round", + ]) + }); + + FUNCTIONS.contains(name) +} + +/// Returns `true` if `name` is an aggregate or aggregate function +/// supported by InfluxQL. +pub fn is_aggregate_function(name: &str) -> bool { + static FUNCTIONS: Lazy> = Lazy::new(|| { + HashSet::from([ + // Scalar-like functions + "cumulative_sum", + "derivative", + "difference", + "elapsed", + "moving_average", + "non_negative_derivative", + "non_negative_difference", + // Selector functions + "bottom", + "first", + "last", + "max", + "min", + "percentile", + "sample", + "top", + // Aggregate functions + "count", + "integral", + "mean", + "median", + "mode", + "spread", + "stddev", + "sum", + // Prediction functions + "holt_winters", + "holt_winters_with_fit", + // Technical analysis functions + "chande_momentum_oscillator", + "exponential_moving_average", + "double_exponential_moving_average", + "kaufmans_efficiency_ratio", + "kaufmans_adaptive_moving_average", + "triple_exponential_moving_average", + "triple_exponential_derivative", + "relative_strength_index", + ]) + }); + + FUNCTIONS.contains(name) +} + +/// Returns `true` if `name` is `"now"`. +pub fn is_now_function(name: &str) -> bool { + name == "now" +} diff --git a/influxdb_influxql_parser/src/identifier.rs b/influxdb_influxql_parser/src/identifier.rs new file mode 100644 index 0000000..dcbc2fb --- /dev/null +++ b/influxdb_influxql_parser/src/identifier.rs @@ -0,0 +1,167 @@ +//! # Parse an InfluxQL [identifier] +//! +//! 
Identifiers are parsed using the following rules: +//! +//! * double quoted identifiers can contain any unicode character other than a new line +//! * double quoted identifiers can contain escaped characters, namely `\"`, `\n`, `\t`, `\\` and `\'` +//! * double quoted identifiers can contain [InfluxQL keywords][keywords] +//! * unquoted identifiers must start with an upper or lowercase ASCII character or `_` +//! * unquoted identifiers may contain only ASCII letters, decimal digits, and `_` +//! * identifiers may be preceded by whitespace +//! +//! [identifier]: https://docs.influxdata.com/influxdb/v1.8/query_language/spec/#identifiers +//! [keywords]: https://docs.influxdata.com/influxdb/v1.8/query_language/spec/#keywords + +use crate::common::ws0; +use crate::internal::ParseResult; +use crate::keywords::sql_keyword; +use crate::string::double_quoted_string; +use crate::{impl_tuple_clause, write_quoted_string}; +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::{alpha1, alphanumeric1}; +use nom::combinator::{map, not, recognize}; +use nom::multi::many0_count; +use nom::sequence::{pair, preceded}; +use std::fmt::{Display, Formatter, Write}; +use std::{fmt, mem}; + +/// Parse an unquoted InfluxQL identifier. +pub(crate) fn unquoted_identifier(i: &str) -> ParseResult<&str, &str> { + preceded( + not(sql_keyword), + recognize(pair( + alt((alpha1, tag("_"))), + many0_count(alt((alphanumeric1, tag("_")))), + )), + )(i) +} + +/// A type that represents an InfluxQL identifier. +#[derive(Clone, Debug, Eq, Hash, PartialEq, Ord, PartialOrd)] +pub struct Identifier(pub(crate) String); + +impl_tuple_clause!(Identifier, String); + +impl From<&str> for Identifier { + fn from(s: &str) -> Self { + Self(s.to_string()) + } +} + +impl Identifier { + /// Returns true if the identifier requires quotes. 
+ pub fn requires_quotes(&self) -> bool { + nom::sequence::terminated(unquoted_identifier, nom::combinator::eof)(&self.0).is_err() + } + + /// Takes the string value out of the identifier, leaving a default string value in its place. + pub fn take(&mut self) -> String { + mem::take(&mut self.0) + } +} + +impl Display for Identifier { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write_quoted_string!(f, '"', self.0.as_str(), unquoted_identifier, '\n' => "\\n", '\\' => "\\\\", '"' => "\\\""); + Ok(()) + } +} + +/// Parses an InfluxQL [Identifier]. +/// +/// EBNF for an identifier is approximately: +/// +/// ```text +/// identifier ::= whitespace? ( quoted_identifier | unquoted_identifier ) +/// unquoted_identifier ::= [_a..zA..Z] [_a..zA..Z0..9]* +/// quoted_identifier ::= '"' [^"\n] '"' +/// ``` +pub(crate) fn identifier(i: &str) -> ParseResult<&str, Identifier> { + // See: https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L432-L438 + preceded( + ws0, + alt(( + map(unquoted_identifier, Into::into), + map(double_quoted_string, Into::into), + )), + )(i) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_unquoted_identifier() { + // all ascii + let (_, got) = unquoted_identifier("cpu").unwrap(); + assert_eq!(got, "cpu"); + + // all valid chars + let (_, got) = unquoted_identifier("cpu_0").unwrap(); + assert_eq!(got, "cpu_0"); + + // begin with underscore + let (_, got) = unquoted_identifier("_cpu_0").unwrap(); + assert_eq!(got, "_cpu_0"); + + // ┌─────────────────────────────┐ + // │ Fallible tests │ + // └─────────────────────────────┘ + + // start with number + unquoted_identifier("0cpu").unwrap_err(); + + // is a keyword + unquoted_identifier("as").unwrap_err(); + } + + #[test] + fn test_identifier() { + // quoted + let (_, got) = identifier("\"quick draw\"").unwrap(); + assert_eq!(got, "quick draw".into()); + // validate that `as_str` returns the unquoted string + assert_eq!(got.as_str(), 
"quick draw"); + + // unquoted + let (_, got) = identifier("quick_draw").unwrap(); + assert_eq!(got, "quick_draw".into()); + + // leading whitespace + let (_, got) = identifier(" quick_draw").unwrap(); + assert_eq!(got, "quick_draw".into()); + } + + #[test] + fn test_identifier_display() { + // Identifier properly escapes specific characters and quotes output + let got = Identifier("quick\n\t\\\"'draw \u{1f47d}".into()).to_string(); + assert_eq!(got, r#""quick\n \\\"'draw 👽""#); + + // Identifier displays unquoted output + let got = Identifier("quick_draw".into()).to_string(); + assert_eq!(got, "quick_draw"); + } + + #[test] + fn test_identifier_requires_quotes() { + // Following examples require quotes + + // Quotes, spaces, non-ASCII + assert!(Identifier("quick\n\t\\\"'draw \u{1f47d}".into()).requires_quotes()); + // non-ASCII + assert!(Identifier("quick_\u{1f47d}".into()).requires_quotes()); + // starts with number + assert!(Identifier("0quick".into()).requires_quotes()); + + // Following examples do not require quotes + + // starts with underscore + assert!(!Identifier("_quick".into()).requires_quotes()); + + // Only ASCII, non-space + assert!(!Identifier("quick_90".into()).requires_quotes()); + } +} diff --git a/influxdb_influxql_parser/src/internal.rs b/influxdb_influxql_parser/src/internal.rs new file mode 100644 index 0000000..90b2f59 --- /dev/null +++ b/influxdb_influxql_parser/src/internal.rs @@ -0,0 +1,133 @@ +//! Internal result and error types used to build InfluxQL parsers +//! +use nom::error::{ErrorKind as NomErrorKind, ParseError as NomParseError}; +use nom::Parser; +use std::borrow::Borrow; +use std::fmt::{Display, Formatter}; + +/// This trait must be implemented in order to use the [`map_fail`] and +/// [`expect`] functions for generating user-friendly error messages. 
+pub(crate) trait ParseError<'a>: NomParseError<&'a str> + Sized { + fn from_message(input: &'a str, message: &'static str) -> Self; +} + +/// An internal error type used to build InfluxQL parsers. +#[derive(Debug, PartialEq, Eq)] +pub enum Error { + Syntax { input: I, message: &'static str }, + Nom(I, NomErrorKind), +} + +impl Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Syntax { input: _, message } => { + write!(f, "Syntax error: {message}") + } + Self::Nom(_, kind) => write!(f, "nom error: {kind:?}"), + } + } +} + +impl<'a> ParseError<'a> for Error<&'a str> { + fn from_message(input: &'a str, message: &'static str) -> Self { + Self::Syntax { input, message } + } +} + +/// Applies a function returning a [`ParseResult`] over the result of the `parser`. +/// If the parser returns an error, the result will be mapped to an unrecoverable +/// [`nom::Err::Failure`] with the specified `message` for additional context. +pub(crate) fn map_fail<'a, O1, O2, E: ParseError<'a>, E2, F, G>( + message: &'static str, + mut parser: F, + mut f: G, +) -> impl FnMut(&'a str) -> ParseResult<&'a str, O2, E> +where + F: Parser<&'a str, O1, E>, + G: FnMut(O1) -> Result, +{ + move |input| { + let (input, o1) = parser.parse(input)?; + match f(o1) { + Ok(o2) => Ok((input, o2)), + Err(_) => Err(nom::Err::Failure(E::from_message(input, message))), + } + } +} + +/// Applies a function returning a [`ParseResult`] over the result of the `parser`. +/// If the parser returns an error, the result will be mapped to a recoverable +/// [`nom::Err::Error`] with the specified `message` for additional context. 
+pub(crate) fn map_error<'a, O1, O2, E: ParseError<'a>, E2, F, G>( + message: &'static str, + mut parser: F, + mut f: G, +) -> impl FnMut(&'a str) -> ParseResult<&'a str, O2, E> +where + F: Parser<&'a str, O1, E>, + G: FnMut(O1) -> Result, +{ + move |input| { + let (input, o1) = parser.parse(input)?; + match f(o1) { + Ok(o2) => Ok((input, o2)), + Err(_) => Err(nom::Err::Error(E::from_message(input, message))), + } + } +} + +/// Transforms a [`nom::Err::Error`] to a [`nom::Err::Failure`] using `message` for additional +/// context. +pub(crate) fn expect<'a, E: ParseError<'a>, F, O>( + message: &'static str, + mut f: F, +) -> impl FnMut(&'a str) -> ParseResult<&'a str, O, E> +where + F: Parser<&'a str, O, E>, +{ + move |i| match f.parse(i) { + Ok(o) => Ok(o), + Err(nom::Err::Incomplete(i)) => Err(nom::Err::Incomplete(i)), + Err(nom::Err::Error(_)) => Err(nom::Err::Failure(E::from_message(i, message))), + Err(nom::Err::Failure(e)) => Err(nom::Err::Failure(e)), + } +} + +/// Returns the result of `f` if it satisfies `is_valid`; otherwise, +/// returns an error using the specified `message`. +pub(crate) fn verify<'a, O1, O2, E: ParseError<'a>, F, G>( + message: &'static str, + mut f: F, + is_valid: G, +) -> impl FnMut(&'a str) -> ParseResult<&'a str, O1, E> +where + F: Parser<&'a str, O1, E>, + G: Fn(&O2) -> bool, + O1: Borrow, + O2: ?Sized, +{ + move |i: &str| { + let (remain, o) = f.parse(i)?; + + if is_valid(o.borrow()) { + Ok((remain, o)) + } else { + Err(nom::Err::Failure(E::from_message(i, message))) + } + } +} + +impl NomParseError for Error { + fn from_error_kind(input: I, kind: NomErrorKind) -> Self { + Self::Nom(input, kind) + } + + fn append(_: I, _: NomErrorKind, other: Self) -> Self { + other + } +} + +/// ParseResult is a type alias for [`nom::IResult`] used by nom combinator +/// functions for parsing InfluxQL. 
+pub(crate) type ParseResult> = nom::IResult; diff --git a/influxdb_influxql_parser/src/keywords.rs b/influxdb_influxql_parser/src/keywords.rs new file mode 100644 index 0000000..d665245 --- /dev/null +++ b/influxdb_influxql_parser/src/keywords.rs @@ -0,0 +1,353 @@ +//! # Parse InfluxQL [keywords] +//! +//! [keywords]: https://docs.influxdata.com/influxdb/v1.8/query_language/spec/#keywords + +use crate::internal::ParseResult; +use nom::bytes::complete::tag_no_case; +use nom::character::complete::alpha1; +use nom::combinator::{fail, verify}; +use nom::sequence::terminated; +use nom::FindToken; +use once_cell::sync::Lazy; +use std::collections::HashSet; +use std::hash::{Hash, Hasher}; + +/// Verifies the next character of `i` is valid following a keyword. +/// +/// Keywords may be followed by whitespace, statement terminator (;), parens, +/// or conditional and arithmetic operators or EOF +fn keyword_follow_char(i: &str) -> ParseResult<&str, ()> { + if i.is_empty() || b" \n\t;(),=!><+-/*|&^%".find_token(i.bytes().next().unwrap()) { + Ok((i, ())) + } else { + fail(i) + } +} + +/// Token represents a string with case-insensitive ordering and equality. +#[derive(Debug, Clone)] +pub(crate) struct Token<'a>(pub(crate) &'a str); + +impl PartialEq for Token<'_> { + fn eq(&self, other: &Self) -> bool { + self.0.len() == other.0.len() + && self + .0 + .chars() + .zip(other.0.chars()) + .all(|(l, r)| l.to_ascii_uppercase() == r.to_ascii_uppercase()) + } +} + +impl<'a> Eq for Token<'a> {} + +/// The Hash implementation for Token ensures +/// that two tokens, regardless of case, hash to the same +/// value. 
+impl<'a> Hash for Token<'a> { + fn hash(&self, state: &mut H) { + self.0 + .as_bytes() + .iter() + .map(u8::to_ascii_uppercase) + .for_each(|v| state.write_u8(v)); + } +} + +static KEYWORDS: Lazy>> = Lazy::new(|| { + HashSet::from([ + Token("ALL"), + Token("ALTER"), + Token("ANALYZE"), + Token("AND"), + Token("ANY"), + Token("AS"), + Token("ASC"), + Token("BEGIN"), + Token("BY"), + Token("CARDINALITY"), + Token("CREATE"), + Token("CONTINUOUS"), + Token("DATABASE"), + Token("DATABASES"), + Token("DEFAULT"), + Token("DELETE"), + Token("DESC"), + Token("DESTINATIONS"), + Token("DIAGNOSTICS"), + Token("DISTINCT"), + Token("DROP"), + Token("DURATION"), + Token("END"), + Token("EVERY"), + Token("EXACT"), + Token("EXPLAIN"), + Token("FIELD"), + Token("FOR"), + Token("FROM"), + Token("GRANT"), + Token("GRANTS"), + Token("GROUP"), + Token("GROUPS"), + Token("IN"), + Token("INF"), + Token("INSERT"), + Token("INTO"), + Token("KEY"), + Token("KEYS"), + Token("KILL"), + Token("LIMIT"), + Token("MEASUREMENT"), + Token("MEASUREMENTS"), + Token("NAME"), + Token("OFFSET"), + Token("OR"), + Token("ON"), + Token("ORDER"), + Token("PASSWORD"), + Token("POLICY"), + Token("POLICIES"), + Token("PRIVILEGES"), + Token("QUERIES"), + Token("QUERY"), + Token("READ"), + Token("REPLICATION"), + Token("RESAMPLE"), + Token("RETENTION"), + Token("REVOKE"), + Token("SELECT"), + Token("SERIES"), + Token("SET"), + Token("SHOW"), + Token("SHARD"), + Token("SHARDS"), + Token("SLIMIT"), + Token("SOFFSET"), + Token("STATS"), + Token("SUBSCRIPTION"), + Token("SUBSCRIPTIONS"), + Token("TAG"), + Token("TO"), + Token("USER"), + Token("USERS"), + Token("VALUES"), + Token("WHERE"), + Token("WITH"), + Token("WRITE"), + ]) +}); + +/// Matches any InfluxQL reserved keyword. 
+pub(crate) fn sql_keyword(i: &str) -> ParseResult<&str, &str> { + verify(terminated(alpha1, keyword_follow_char), |tok: &str| { + KEYWORDS.contains(&Token(tok)) + })(i) +} + +/// Recognizes a case-insensitive `keyword`, ensuring it is followed by +/// a valid separator. +pub(crate) fn keyword<'a>(keyword: &'static str) -> impl FnMut(&'a str) -> ParseResult<&str, &str> { + terminated(tag_no_case(keyword), keyword_follow_char) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::assert_error; + use assert_matches::assert_matches; + + #[test] + fn test_keywords() { + // all keywords + + sql_keyword("ALL").unwrap(); + sql_keyword("ALTER").unwrap(); + sql_keyword("ANALYZE").unwrap(); + sql_keyword("ANY").unwrap(); + sql_keyword("AS").unwrap(); + sql_keyword("ASC").unwrap(); + sql_keyword("BEGIN").unwrap(); + sql_keyword("BY").unwrap(); + sql_keyword("CARDINALITY").unwrap(); + sql_keyword("CREATE").unwrap(); + sql_keyword("CONTINUOUS").unwrap(); + sql_keyword("DATABASE").unwrap(); + sql_keyword("DATABASES").unwrap(); + sql_keyword("DEFAULT").unwrap(); + sql_keyword("DELETE").unwrap(); + sql_keyword("DESC").unwrap(); + sql_keyword("DESTINATIONS").unwrap(); + sql_keyword("DIAGNOSTICS").unwrap(); + sql_keyword("DISTINCT").unwrap(); + sql_keyword("DROP").unwrap(); + sql_keyword("DURATION").unwrap(); + sql_keyword("END").unwrap(); + sql_keyword("EVERY").unwrap(); + sql_keyword("EXACT").unwrap(); + sql_keyword("EXPLAIN").unwrap(); + sql_keyword("FIELD").unwrap(); + sql_keyword("FOR").unwrap(); + sql_keyword("FROM").unwrap(); + sql_keyword("GRANT").unwrap(); + sql_keyword("GRANTS").unwrap(); + sql_keyword("GROUP").unwrap(); + sql_keyword("GROUPS").unwrap(); + sql_keyword("IN").unwrap(); + sql_keyword("INF").unwrap(); + sql_keyword("INSERT").unwrap(); + sql_keyword("INTO").unwrap(); + sql_keyword("KEY").unwrap(); + sql_keyword("KEYS").unwrap(); + sql_keyword("KILL").unwrap(); + sql_keyword("LIMIT").unwrap(); + sql_keyword("MEASUREMENT").unwrap(); + 
sql_keyword("MEASUREMENTS").unwrap(); + sql_keyword("NAME").unwrap(); + sql_keyword("OFFSET").unwrap(); + sql_keyword("ON").unwrap(); + sql_keyword("ORDER").unwrap(); + sql_keyword("PASSWORD").unwrap(); + sql_keyword("POLICY").unwrap(); + sql_keyword("POLICIES").unwrap(); + sql_keyword("PRIVILEGES").unwrap(); + sql_keyword("QUERIES").unwrap(); + sql_keyword("QUERY").unwrap(); + sql_keyword("READ").unwrap(); + sql_keyword("REPLICATION").unwrap(); + sql_keyword("RESAMPLE").unwrap(); + sql_keyword("RETENTION").unwrap(); + sql_keyword("REVOKE").unwrap(); + sql_keyword("SELECT").unwrap(); + sql_keyword("SERIES").unwrap(); + sql_keyword("SET").unwrap(); + sql_keyword("SHOW").unwrap(); + sql_keyword("SHARD").unwrap(); + sql_keyword("SHARDS").unwrap(); + sql_keyword("SLIMIT").unwrap(); + sql_keyword("SOFFSET").unwrap(); + sql_keyword("STATS").unwrap(); + sql_keyword("SUBSCRIPTION").unwrap(); + sql_keyword("SUBSCRIPTIONS").unwrap(); + sql_keyword("TAG").unwrap(); + sql_keyword("TO").unwrap(); + sql_keyword("USER").unwrap(); + sql_keyword("USERS").unwrap(); + sql_keyword("VALUES").unwrap(); + sql_keyword("WHERE").unwrap(); + sql_keyword("WITH").unwrap(); + sql_keyword("WRITE").unwrap(); + + // case insensitivity + sql_keyword("all").unwrap(); + + // ┌─────────────────────────────┐ + // │ Fallible tests │ + // └─────────────────────────────┘ + + sql_keyword("NOT_A_KEYWORD").unwrap_err(); + } + + #[test] + fn test_keyword() { + // Create a parser for the OR keyword + let mut or_keyword = keyword("OR"); + + // Can parse with matching case + let (rem, got) = or_keyword("OR").unwrap(); + assert_eq!(rem, ""); + assert_eq!(got, "OR"); + + // Not case sensitive + let (rem, got) = or_keyword("or").unwrap(); + assert_eq!(rem, ""); + assert_eq!(got, "or"); + + // Does not consume input that follows a keyword + let (rem, got) = or_keyword("or(a AND b)").unwrap(); + assert_eq!(rem, "(a AND b)"); + assert_eq!(got, "or"); + + // Will fail because keyword `OR` in `ORDER` is not recognized, 
as is not terminated by a valid character + let err = or_keyword("ORDER").unwrap_err(); + assert_matches!(err, nom::Err::Error(crate::internal::Error::Nom(_, kind)) if kind == nom::error::ErrorKind::Fail); + } + + #[test] + fn test_keyword_followed_by_valid_char() { + let mut tag_keyword = keyword("TAG"); + + // followed by EOF + let (rem, got) = tag_keyword("tag").unwrap(); + assert_eq!(rem, ""); + assert_eq!(got, "tag"); + + // + // Test some of the expected characters + // + + let (rem, got) = tag_keyword("tag!=foo").unwrap(); + assert_eq!(rem, "!=foo"); + assert_eq!(got, "tag"); + + let (rem, got) = tag_keyword("tag>foo").unwrap(); + assert_eq!(rem, ">foo"); + assert_eq!(got, "tag"); + + let (rem, got) = tag_keyword("tag&1 = foo").unwrap(); + assert_eq!(rem, "&1 = foo"); + assert_eq!(got, "tag"); + + // Fallible + + assert_error!(tag_keyword("tag$"), Fail); + } + + #[test] + fn test_token() { + // Are equal with differing case + let (a, b) = (Token("and"), Token("AND")); + assert_eq!(a, b); + + // Are equal with same case + let (a, b) = (Token("and"), Token("and")); + assert_eq!(a, b); + + // a < b + let (a, b) = (Token("and"), Token("apple")); + assert_ne!(a, b); + + // a < b + let (a, b) = (Token("and"), Token("APPLE")); + assert_ne!(a, b); + + // a < b + let (a, b) = (Token("AND"), Token("apple")); + assert_ne!(a, b); + + // a > b + let (a, b) = (Token("and"), Token("aardvark")); + assert_ne!(a, b); + + // a > b + let (a, b) = (Token("and"), Token("AARDVARK")); + assert_ne!(a, b); + + // a > b + let (a, b) = (Token("AND"), Token("aardvark")); + assert_ne!(a, b); + + // Validate prefixes don't match and are correct ordering + + let (a, b) = (Token("aaa"), Token("aaabbb")); + assert_ne!(a, b); + + let (a, b) = (Token("aaabbb"), Token("aaa")); + assert_ne!(a, b); + + let (a, b) = (Token("aaa"), Token("AAABBB")); + assert_ne!(a, b); + + let (a, b) = (Token("AAABBB"), Token("aaa")); + assert_ne!(a, b); + } +} diff --git a/influxdb_influxql_parser/src/lib.rs 
b/influxdb_influxql_parser/src/lib.rs new file mode 100644 index 0000000..4bb1a60 --- /dev/null +++ b/influxdb_influxql_parser/src/lib.rs @@ -0,0 +1,188 @@ +//! # Parse a subset of [InfluxQL] +//! +//! [InfluxQL]: https://docs.influxdata.com/influxdb/v1.8/query_language + +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +use crate::common::{statement_terminator, ws0}; +use crate::internal::Error as InternalError; +use crate::statement::{statement, Statement}; +use common::ParseError; +use nom::combinator::eof; +use nom::Offset; + +#[cfg(test)] +mod test_util; + +pub mod common; +pub mod create; +pub mod delete; +pub mod drop; +pub mod explain; +pub mod expression; +pub mod functions; +pub mod identifier; +mod internal; +mod keywords; +pub mod literal; +pub mod parameter; +pub mod select; +pub mod show; +pub mod show_field_keys; +pub mod show_measurements; +pub mod show_retention_policies; +pub mod show_tag_keys; +pub mod show_tag_values; +pub mod simple_from_clause; +pub mod statement; +pub mod string; +pub mod time_range; +pub mod timestamp; +pub mod visit; +pub mod visit_mut; + +/// ParseResult is type that represents the success or failure of parsing +/// a given input into a set of InfluxQL statements. +/// +/// Errors are human-readable messages indicating the cause of the parse failure. +pub type ParseResult = Result, ParseError>; + +/// Parse the input into a set of InfluxQL statements. 
+pub fn parse_statements(input: &str) -> ParseResult { + let mut res = Vec::new(); + let mut i: &str = input; + + loop { + // Consume whitespace from the input + (i, _) = ws0(i).expect("ws0 is infallible"); + + if eof::<_, nom::error::Error<_>>(i).is_ok() { + return Ok(res); + } + + if let Ok((i1, _)) = statement_terminator(i) { + i = i1; + continue; + } + + match statement(i) { + Ok((i1, o)) => { + res.push(o); + i = i1; + } + Err(nom::Err::Failure(InternalError::Syntax { + input: pos, + message, + })) => { + return Err(ParseError { + message: message.into(), + pos: input.offset(pos), + }) + } + // any other error indicates an invalid statement + Err(_) => { + return Err(ParseError { + message: "invalid SQL statement".into(), + pos: input.offset(i), + }) + } + } + } +} + +#[cfg(test)] +mod test { + use crate::parse_statements; + + /// Validates that the [`parse_statements`] function + /// handles statement terminators and errors. + #[test] + fn test_parse_statements() { + // Parse a single statement, without a terminator + let got = parse_statements("SHOW MEASUREMENTS").unwrap(); + assert_eq!(got.first().unwrap().to_string(), "SHOW MEASUREMENTS"); + + // Parse a single statement, with a terminator + let got = parse_statements("SHOW MEASUREMENTS;").unwrap(); + assert_eq!(got[0].to_string(), "SHOW MEASUREMENTS"); + + // Parse multiple statements with whitespace + let got = parse_statements("SHOW MEASUREMENTS;\nSHOW MEASUREMENTS LIMIT 1").unwrap(); + assert_eq!(got[0].to_string(), "SHOW MEASUREMENTS"); + assert_eq!(got[1].to_string(), "SHOW MEASUREMENTS LIMIT 1"); + + // Parse multiple statements with a terminator in quotes, ensuring it is not interpreted as + // a terminator + let got = + parse_statements("SHOW MEASUREMENTS WITH MEASUREMENT = \";\";SHOW DATABASES").unwrap(); + assert_eq!( + got[0].to_string(), + "SHOW MEASUREMENTS WITH MEASUREMENT = \";\"" + ); + assert_eq!(got[1].to_string(), "SHOW DATABASES"); + + // Parses a statement with a comment + let got = 
parse_statements( + "SELECT idle FROM cpu WHERE host = 'host1' --GROUP BY host fill(null)", + ) + .unwrap(); + assert_eq!( + got[0].to_string(), + "SELECT idle FROM cpu WHERE host = 'host1'" + ); + + // Parses multiple statements with a comment + let got = parse_statements( + "SELECT idle FROM cpu WHERE host = 'host1' --GROUP BY host fill(null)\nSHOW DATABASES", + ) + .unwrap(); + assert_eq!( + got[0].to_string(), + "SELECT idle FROM cpu WHERE host = 'host1'" + ); + assert_eq!(got[1].to_string(), "SHOW DATABASES"); + + // Parses statement with inline comment + let got = parse_statements(r#"SELECT idle FROM cpu WHERE/* time > now() AND */host = 'host1' --GROUP BY host fill(null)"#).unwrap(); + assert_eq!( + got[0].to_string(), + "SELECT idle FROM cpu WHERE host = 'host1'" + ); + + // Parses empty single-line comments in various placements + let got = parse_statements( + r#"-- foo + -- + -- + SELECT value FROM cpu-- + -- foo + ;SELECT val2 FROM cpu"#, + ) + .unwrap(); + assert_eq!(got[0].to_string(), "SELECT value FROM cpu"); + assert_eq!(got[1].to_string(), "SELECT val2 FROM cpu"); + + // Returns error for invalid statement + let got = parse_statements("BAD SQL").unwrap_err(); + assert_eq!(got.to_string(), "invalid SQL statement at pos 0"); + + // Returns error for invalid statement after first + let got = parse_statements("SHOW MEASUREMENTS;BAD SQL").unwrap_err(); + assert_eq!(got.to_string(), "invalid SQL statement at pos 18"); + } +} diff --git a/influxdb_influxql_parser/src/literal.rs b/influxdb_influxql_parser/src/literal.rs new file mode 100644 index 0000000..3611987 --- /dev/null +++ b/influxdb_influxql_parser/src/literal.rs @@ -0,0 +1,600 @@ +//! Types and parsers for literals. 
+ +use crate::common::ws0; +use crate::internal::{map_error, map_fail, ParseResult}; +use crate::keywords::keyword; +use crate::string::{regex, single_quoted_string, Regex}; +use crate::timestamp::Timestamp; +use crate::{impl_tuple_clause, write_escaped}; +use chrono::{NaiveDateTime, Offset}; +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::{char, digit0, digit1}; +use nom::combinator::{map, opt, recognize, value}; +use nom::multi::fold_many1; +use nom::sequence::{pair, preceded, separated_pair}; +use std::fmt; +use std::fmt::{Display, Formatter, Write}; + +/// Number of nanoseconds in a microsecond. +const NANOS_PER_MICRO: i64 = 1000; +/// Number of nanoseconds in a millisecond. +const NANOS_PER_MILLI: i64 = 1000 * NANOS_PER_MICRO; +/// Number of nanoseconds in a second. +const NANOS_PER_SEC: i64 = 1000 * NANOS_PER_MILLI; +/// Number of nanoseconds in a minute. +const NANOS_PER_MIN: i64 = 60 * NANOS_PER_SEC; +/// Number of nanoseconds in an hour. +const NANOS_PER_HOUR: i64 = 60 * NANOS_PER_MIN; +/// Number of nanoseconds in a day. +const NANOS_PER_DAY: i64 = 24 * NANOS_PER_HOUR; +/// Number of nanoseconds in a week. +const NANOS_PER_WEEK: i64 = 7 * NANOS_PER_DAY; + +/// Primitive InfluxQL literal values, such as strings and regular expressions. +#[derive(Clone, Debug, PartialEq)] +pub enum Literal { + /// Signed integer literal. + Integer(i64), + + /// Unsigned integer literal. + Unsigned(u64), + + /// Float literal. + Float(f64), + + /// Unescaped string literal. + String(String), + + /// Boolean literal. + Boolean(bool), + + /// Duration literal in nanoseconds. + Duration(Duration), + + /// Unescaped regular expression literal. + Regex(Regex), + + /// A timestamp identified in a time range expression of a conditional expression. 
+ Timestamp(Timestamp), +} + +impl From for Literal { + fn from(v: String) -> Self { + Self::String(v) + } +} + +impl From for Literal { + fn from(v: u64) -> Self { + Self::Unsigned(v) + } +} + +impl From for Literal { + fn from(v: i64) -> Self { + Self::Integer(v) + } +} + +impl From for Literal { + fn from(v: f64) -> Self { + Self::Float(v) + } +} + +impl From for Literal { + fn from(v: bool) -> Self { + Self::Boolean(v) + } +} + +impl From for Literal { + fn from(v: Duration) -> Self { + Self::Duration(v) + } +} + +impl From for Literal { + fn from(v: Regex) -> Self { + Self::Regex(v) + } +} + +impl Display for Literal { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Integer(v) => write!(f, "{v}"), + Self::Unsigned(v) => write!(f, "{v}"), + Self::Float(v) => write!(f, "{v}"), + Self::String(v) => { + f.write_char('\'')?; + write_escaped!(f, v, '\n' => "\\n", '\\' => "\\\\", '\'' => "\\'", '"' => "\\\""); + f.write_char('\'') + } + Self::Boolean(v) => write!(f, "{}", if *v { "true" } else { "false" }), + Self::Duration(v) => write!(f, "{v}"), + Self::Regex(v) => write!(f, "{v}"), + Self::Timestamp(ts) => write!(f, "{}", ts.to_rfc3339()), + } + } +} + +/// Parse an InfluxQL integer. +/// +/// InfluxQL defines an integer as follows +/// +/// ```text +/// INTEGER ::= [0-9]+ +/// ``` +fn integer(i: &str) -> ParseResult<&str, i64> { + map_error("unable to parse integer", digit1, &str::parse)(i) +} + +/// Parse an InfluxQL integer to a [`Literal::Integer`] or [`Literal::Unsigned`] +/// if the string overflows. This behavior is consistent with [InfluxQL]. 
+/// +/// InfluxQL defines an integer as follows +/// +/// ```text +/// INTEGER ::= [0-9]+ +/// ``` +/// +/// [InfluxQL]: https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2669-L2675 +fn integer_literal(i: &str) -> ParseResult<&str, Literal> { + map_fail( + "unable to parse integer due to overflow", + digit1, + |s: &str| { + s.parse::() + .map(Literal::Integer) + .or_else(|_| s.parse::().map(Literal::Unsigned)) + }, + )(i) +} + +/// Parse an unsigned InfluxQL integer. +/// +/// InfluxQL defines an integer as follows +/// +/// ```text +/// INTEGER ::= [0-9]+ +/// ``` +pub(crate) fn unsigned_integer(i: &str) -> ParseResult<&str, u64> { + map_fail("unable to parse unsigned integer", digit1, &str::parse)(i) +} + +/// Parse an unsigned InfluxQL floating point number. +/// +/// InfluxQL defines a floating point number as follows +/// +/// ```text +/// float ::= INTEGER "." INTEGER +/// INTEGER ::= [0-9]+ +/// ``` +fn float(i: &str) -> ParseResult<&str, f64> { + map_fail( + "unable to parse float", + recognize(separated_pair(digit0, tag("."), digit1)), + &str::parse, + )(i) +} + +/// Represents any signed number. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Number { + /// Contains a 64-bit integer. + Integer(i64), + /// Contains a 64-bit float. + Float(f64), +} + +impl Display for Number { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Integer(v) => fmt::Display::fmt(v, f), + Self::Float(v) => fmt::Display::fmt(v, f), + } + } +} + +impl From for Number { + fn from(v: f64) -> Self { + Self::Float(v) + } +} + +impl From for Number { + fn from(v: i64) -> Self { + Self::Integer(v) + } +} + +/// Parse a signed [`Number`]. 
+pub(crate) fn number(i: &str) -> ParseResult<&str, Number> { + let (remaining, sign) = opt(alt((char('-'), char('+'))))(i)?; + preceded( + ws0, + alt(( + map(float, move |v| { + Number::Float(v * if let Some('-') = sign { -1.0 } else { 1.0 }) + }), + map(integer, move |v| { + Number::Integer(v * if let Some('-') = sign { -1 } else { 1 }) + }), + )), + )(remaining) +} + +/// Parse the input for an InfluxQL boolean, which must be the value `true` or `false`. +fn boolean(i: &str) -> ParseResult<&str, bool> { + alt((value(true, keyword("TRUE")), value(false, keyword("FALSE"))))(i) +} + +#[derive(Clone)] +enum DurationUnit { + Nanosecond, + Microsecond, + Millisecond, + Second, + Minute, + Hour, + Day, + Week, +} + +/// Represents an InfluxQL duration in nanoseconds. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct Duration(pub(crate) i64); + +impl_tuple_clause!(Duration, i64); + +static DIVISORS: [(i64, &str); 8] = [ + (NANOS_PER_WEEK, "w"), + (NANOS_PER_DAY, "d"), + (NANOS_PER_HOUR, "h"), + (NANOS_PER_MIN, "m"), + (NANOS_PER_SEC, "s"), + (NANOS_PER_MILLI, "ms"), + (NANOS_PER_MICRO, "us"), + (1, "ns"), +]; + +impl Display for Duration { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let v = if self.0.is_negative() { + write!(f, "-")?; + -self.0 + } else { + self.0 + }; + match v { + 0 => f.write_str("0s")?, + mut i => { + // only return the divisors that are > self + for (div, unit) in DIVISORS.iter().filter(|(div, _)| v > *div) { + let units = i / div; + if units > 0 { + write!(f, "{units}{unit}")?; + i -= units * div; + } + } + } + } + + Ok(()) + } +} + +/// Parse the input for a InfluxQL duration fragment and returns the value in nanoseconds. 
+fn single_duration(i: &str) -> ParseResult<&str, i64> { + use DurationUnit::*; + + map_fail( + "overflow", + pair( + integer, + alt(( + value(Nanosecond, tag("ns")), // nanoseconds + value(Microsecond, tag("µ")), // microseconds + value(Microsecond, tag("u")), // microseconds + value(Millisecond, tag("ms")), // milliseconds + value(Second, tag("s")), // seconds + value(Minute, tag("m")), // minutes + value(Hour, tag("h")), // hours + value(Day, tag("d")), // days + value(Week, tag("w")), // weeks + )), + ), + |(v, unit)| { + (match unit { + Nanosecond => Some(v), + Microsecond => v.checked_mul(NANOS_PER_MICRO), + Millisecond => v.checked_mul(NANOS_PER_MILLI), + Second => v.checked_mul(NANOS_PER_SEC), + Minute => v.checked_mul(NANOS_PER_MIN), + Hour => v.checked_mul(NANOS_PER_HOUR), + Day => v.checked_mul(NANOS_PER_DAY), + Week => v.checked_mul(NANOS_PER_WEEK), + }) + .ok_or("integer overflow") + }, + )(i) +} + +/// Parse the input for an InfluxQL duration. +pub(crate) fn duration(i: &str) -> ParseResult<&str, Duration> { + map( + fold_many1(single_duration, || 0, |acc, fragment| acc + fragment), + Duration, + )(i) +} + +/// Parse an InfluxQL literal, except a [`Regex`]. +/// +/// Use [`literal`] for parsing any literals, excluding regular expressions. +pub(crate) fn literal_no_regex(i: &str) -> ParseResult<&str, Literal> { + alt(( + // NOTE: order is important, as floats should be tested before durations and integers. + map(float, Literal::Float), + map(duration, Literal::Duration), + integer_literal, + map(single_quoted_string, Literal::String), + map(boolean, Literal::Boolean), + ))(i) +} + +/// Parse any InfluxQL literal. +pub(crate) fn literal(i: &str) -> ParseResult<&str, Literal> { + alt((literal_no_regex, map(regex, Literal::Regex)))(i) +} + +/// Parse an InfluxQL literal regular expression. +pub(crate) fn literal_regex(i: &str) -> ParseResult<&str, Literal> { + map(regex, Literal::Regex)(i) +} + +/// Returns `nanos` as a timestamp. 
+pub fn nanos_to_timestamp(nanos: i64) -> Timestamp { + let (secs, nsec) = num_integer::div_mod_floor(nanos, NANOS_PER_SEC); + + Timestamp::from_naive_utc_and_offset( + NaiveDateTime::from_timestamp_opt(secs, nsec as u32) + .expect("unable to convert duration to timestamp"), + chrono::Utc.fix(), + ) +} + +#[cfg(test)] +mod test { + use super::*; + use assert_matches::assert_matches; + + #[test] + fn test_literal_no_regex() { + // Whole numbers are parsed first as a signed integer, and if that overflows, + // tries an unsigned integer, which is consistent with InfluxQL + let (_, got) = literal_no_regex("42").unwrap(); + assert_matches!(got, Literal::Integer(42)); + + // > i64::MAX + 1 should be parsed as an unsigned integer + let (_, got) = literal_no_regex("9223372036854775808").unwrap(); + assert_matches!(got, Literal::Unsigned(9223372036854775808)); + + let (_, got) = literal_no_regex("42.69").unwrap(); + assert_matches!(got, Literal::Float(v) if v == 42.69); + + let (_, got) = literal_no_regex("'quick draw'").unwrap(); + assert_matches!(got, Literal::String(v) if v == "quick draw"); + + let (_, got) = literal_no_regex("false").unwrap(); + assert_matches!(got, Literal::Boolean(false)); + + let (_, got) = literal_no_regex("true").unwrap(); + assert_matches!(got, Literal::Boolean(true)); + + let (_, got) = literal_no_regex("3h25m").unwrap(); + assert_matches!(got, Literal::Duration(v) if v == Duration(3 * NANOS_PER_HOUR + 25 * NANOS_PER_MIN)); + + // Fallible cases + literal_no_regex("/foo/").unwrap_err(); + } + + #[test] + fn test_literal() { + let (_, got) = literal("/^(match|this)$/").unwrap(); + assert_matches!(got, Literal::Regex(v) if v == "^(match|this)$".into()); + } + + #[test] + fn test_literal_regex() { + let (_, got) = literal_regex("/^(match|this)$/").unwrap(); + assert_matches!(got, Literal::Regex(v) if v == "^(match|this)$".into()); + } + + #[test] + fn test_integer() { + let (_, got) = integer("42").unwrap(); + assert_eq!(got, 42); + + let (_, got) 
= integer(&i64::MAX.to_string()[..]).unwrap(); + assert_eq!(got, i64::MAX); + + // Fallible cases + + integer("hello").unwrap_err(); + + integer("9223372036854775808").expect_err("expected overflow"); + } + + #[test] + fn test_unsigned_integer() { + let (_, got) = unsigned_integer("42").unwrap(); + assert_eq!(got, 42); + + let (_, got) = unsigned_integer(&u64::MAX.to_string()[..]).unwrap(); + assert_eq!(got, u64::MAX); + + // Fallible cases + + unsigned_integer("hello").unwrap_err(); + } + + #[test] + fn test_float() { + let (_, got) = float("42.69").unwrap(); + assert_eq!(got, 42.69); + + let (_, got) = float(".25").unwrap(); + assert_eq!(got, 0.25); + + let (_, got) = float(&format!("{:.1}", f64::MAX)[..]).unwrap(); + assert_eq!(got, f64::MAX); + + // Fallible cases + + // missing trailing digits + float("41.").unwrap_err(); + + // missing decimal + float("41").unwrap_err(); + } + + #[test] + fn test_boolean() { + let (_, got) = boolean("true").unwrap(); + assert!(got); + let (_, got) = boolean("false").unwrap(); + assert!(!got); + + // Fallible cases + + boolean("truey").unwrap_err(); + boolean("falsey").unwrap_err(); + } + + #[test] + fn test_duration_fragment() { + let (_, got) = single_duration("38ns").unwrap(); + assert_eq!(got, 38); + + let (_, got) = single_duration("22u").unwrap(); + assert_eq!(got, 22 * NANOS_PER_MICRO); + + let (rem, got) = single_duration("22us").unwrap(); + assert_eq!(got, 22 * NANOS_PER_MICRO); + assert_eq!(rem, "s"); // prove that we ignore the trailing s + + let (_, got) = single_duration("7µ").unwrap(); + assert_eq!(got, 7 * NANOS_PER_MICRO); + + let (_, got) = single_duration("15ms").unwrap(); + assert_eq!(got, 15 * NANOS_PER_MILLI); + + let (_, got) = single_duration("53s").unwrap(); + assert_eq!(got, 53 * NANOS_PER_SEC); + + let (_, got) = single_duration("158m").unwrap(); + assert_eq!(got, 158 * NANOS_PER_MIN); + + let (_, got) = single_duration("39h").unwrap(); + assert_eq!(got, 39 * NANOS_PER_HOUR); + + let (_, got) = 
single_duration("2d").unwrap(); + assert_eq!(got, 2 * NANOS_PER_DAY); + + let (_, got) = single_duration("5w").unwrap(); + assert_eq!(got, 5 * NANOS_PER_WEEK); + + // Fallible + + // Handle overflow + single_duration("16000w").expect_err("expected overflow"); + } + + #[test] + fn test_duration() { + let (_, got) = duration("10h3m2s").unwrap(); + assert_eq!( + got, + Duration(10 * NANOS_PER_HOUR + 3 * NANOS_PER_MIN + 2 * NANOS_PER_SEC) + ); + } + + #[test] + fn test_display_duration() { + let (_, d) = duration("3w2h15ms").unwrap(); + assert_eq!(d.to_string(), "3w2h15ms"); + + let (_, d) = duration("5s5s5s5s5s").unwrap(); + assert_eq!(d.to_string(), "25s"); + + let d = Duration(0); + assert_eq!(d.to_string(), "0s"); + + // Negative duration + let (_, d) = duration("3w2h15ms").unwrap(); + let d = Duration(-d.0); + assert_eq!(d.to_string(), "-3w2h15ms"); + + let d = Duration( + 20 * NANOS_PER_WEEK + + 6 * NANOS_PER_DAY + + 13 * NANOS_PER_HOUR + + 11 * NANOS_PER_MIN + + 10 * NANOS_PER_SEC + + 9 * NANOS_PER_MILLI + + 8 * NANOS_PER_MICRO + + 500, + ); + assert_eq!(d.to_string(), "20w6d13h11m10s9ms8us500ns"); + } + + #[test] + fn test_number() { + // Test floating point numbers + let (_, got) = number("55.3").unwrap(); + assert_matches!(got, Number::Float(v) if v == 55.3); + + let (_, got) = number("-18.9").unwrap(); + assert_matches!(got, Number::Float(v) if v == -18.9); + + let (_, got) = number("- 18.9").unwrap(); + assert_matches!(got, Number::Float(v) if v == -18.9); + + let (_, got) = number("+33.1").unwrap(); + assert_matches!(got, Number::Float(v) if v == 33.1); + + let (_, got) = number("+ 33.1").unwrap(); + assert_matches!(got, Number::Float(v) if v == 33.1); + + // Test integers + let (_, got) = number("42").unwrap(); + assert_matches!(got, Number::Integer(v) if v == 42); + + let (_, got) = number("-32").unwrap(); + assert_matches!(got, Number::Integer(v) if v == -32); + + let (_, got) = number("- 32").unwrap(); + assert_matches!(got, Number::Integer(v) if v == 
-32); + + let (_, got) = number("+501").unwrap(); + assert_matches!(got, Number::Integer(v) if v == 501); + + let (_, got) = number("+ 501").unwrap(); + assert_matches!(got, Number::Integer(v) if v == 501); + } + + #[test] + fn test_nanos_to_timestamp() { + let ts = nanos_to_timestamp(0); + assert_eq!(ts.to_rfc3339(), "1970-01-01T00:00:00+00:00"); + + // infallible + let ts = nanos_to_timestamp(i64::MAX); + assert_eq!(ts.timestamp_nanos_opt().unwrap(), i64::MAX); + + let ts = nanos_to_timestamp(i64::MIN); + assert_eq!(ts.timestamp_nanos_opt().unwrap(), i64::MIN); + } +} diff --git a/influxdb_influxql_parser/src/parameter.rs b/influxdb_influxql_parser/src/parameter.rs new file mode 100644 index 0000000..5ed28b7 --- /dev/null +++ b/influxdb_influxql_parser/src/parameter.rs @@ -0,0 +1,107 @@ +//! # Parse an InfluxQL [bind parameter] +//! +//! Bind parameters are parsed where a literal value may appear and are prefixed +//! by a `$`. Per the original Go [implementation], the token following the `$` is +//! parsed as an identifier, and therefore may appear in double quotes. +//! +//! [bind parameter]: https://docs.influxdata.com/influxdb/v1.8/tools/api/#bind-parameters +//! [implementation]: https://github.com/influxdata/influxql/blob/df51a45762be9c1b578f01718fa92d286a843fe9/scanner.go#L57-L62 + +use crate::internal::ParseResult; +use crate::string::double_quoted_string; +use crate::{impl_tuple_clause, write_quoted_string}; +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::{alphanumeric1, char}; +use nom::combinator::{map, recognize}; +use nom::multi::many1_count; +use nom::sequence::preceded; +use std::fmt; +use std::fmt::{Display, Formatter, Write}; + +/// Parse an unquoted InfluxQL bind parameter. +fn unquoted_parameter(i: &str) -> ParseResult<&str, &str> { + recognize(many1_count(alt((alphanumeric1, tag("_")))))(i) +} + +/// A type that represents an InfluxQL bind parameter. 
+#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub struct BindParameter(pub(crate) String); + +impl_tuple_clause!(BindParameter, String); + +impl From<&str> for BindParameter { + fn from(s: &str) -> Self { + Self(s.to_string()) + } +} + +impl Display for BindParameter { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_char('$')?; + write_quoted_string!(f, '"', self.0.as_str(), unquoted_parameter, '\n' => "\\n", '\\' => "\\\\", '"' => "\\\""); + Ok(()) + } +} + +/// Parses an InfluxQL [BindParameter]. +pub(crate) fn parameter(i: &str) -> ParseResult<&str, BindParameter> { + // See: https://github.com/influxdata/influxql/blob/df51a45762be9c1b578f01718fa92d286a843fe9/scanner.go#L358-L362 + preceded( + char('$'), + alt(( + map(unquoted_parameter, Into::into), + map(double_quoted_string, Into::into), + )), + )(i) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_parameter() { + // all ascii + let (_, got) = parameter("$cpu").unwrap(); + assert_eq!(got, "cpu".into()); + + // digits + let (_, got) = parameter("$01").unwrap(); + assert_eq!(got, "01".into()); + + // all valid chars + let (_, got) = parameter("$cpu_0").unwrap(); + assert_eq!(got, "cpu_0".into()); + + // keyword + let (_, got) = parameter("$from").unwrap(); + assert_eq!(got, "from".into()); + + // quoted + let (_, got) = parameter("$\"quick draw\"").unwrap(); + assert_eq!(got, "quick draw".into()); + + // ┌─────────────────────────────┐ + // │ Fallible tests │ + // └─────────────────────────────┘ + + // missing `$` prefix + parameter("cpu").unwrap_err(); + } + + #[test] + fn test_bind_parameter_display() { + // BindParameter displays quoted output + let got = BindParameter("from foo".into()).to_string(); + assert_eq!(got, r#"$"from foo""#); + + // BindParameter displays quoted and escaped output + let got = BindParameter("from\nfoo".into()).to_string(); + assert_eq!(got, r#"$"from\nfoo""#); + + // BindParameter displays unquoted output + let got = 
BindParameter("quick_draw".into()).to_string(); + assert_eq!(got, "$quick_draw"); + } +} diff --git a/influxdb_influxql_parser/src/select.rs b/influxdb_influxql_parser/src/select.rs new file mode 100644 index 0000000..f0568c3 --- /dev/null +++ b/influxdb_influxql_parser/src/select.rs @@ -0,0 +1,1404 @@ +//! Types and parsers for the [`SELECT`][sql] statement. +//! +//! [sql]: https://docs.influxdata.com/influxdb/v1.8/query_language/explore-data/#the-basic-select-statement + +use crate::common::{ + limit_clause, offset_clause, order_by_clause, qualified_measurement_name, where_clause, ws0, + ws1, LimitClause, OffsetClause, OrderByClause, ParseError, Parser, QualifiedMeasurementName, + WhereClause, ZeroOrMore, +}; +use crate::expression::arithmetic::Expr::Wildcard; +use crate::expression::arithmetic::{ + arithmetic, call_expression, var_ref, ArithmeticParsers, Expr, WildcardType, +}; +use crate::expression::{Call, VarRef}; +use crate::functions::is_now_function; +use crate::identifier::{identifier, Identifier}; +use crate::impl_tuple_clause; +use crate::internal::{expect, map_fail, verify, ParseResult}; +use crate::keywords::keyword; +use crate::literal::{duration, literal, number, unsigned_integer, Literal, Number}; +use crate::parameter::parameter; +use crate::select::MeasurementSelection::Subquery; +use crate::string::{regex, single_quoted_string, Regex}; +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::char; +use nom::combinator::{map, opt, value}; +use nom::sequence::{delimited, pair, preceded, tuple}; +use nom::Offset; +use std::fmt; +use std::fmt::{Display, Formatter, Write}; +use std::str::FromStr; + +/// Represents a `SELECT` statement. +#[derive(Clone, Debug, PartialEq)] +pub struct SelectStatement { + /// Expressions returned by the selection. + pub fields: FieldList, + + /// A list of measurements or subqueries used as the source data for the selection. 
+ pub from: FromMeasurementClause, + + /// A conditional expression to filter the selection. + pub condition: Option, + + /// Expressions used for grouping the selection. + pub group_by: Option, + + /// The [fill] clause specifies the fill behaviour for the selection. If the value is [`None`], + /// it is the same behavior as `fill(null)`. + /// + /// [fill]: https://docs.influxdata.com/influxdb/v1.8/query_language/explore-data/#group-by-time-intervals-and-fill + pub fill: Option, + + /// Configures the ordering of the selection by time. + pub order_by: Option, + + /// A value to restrict the number of rows returned. + pub limit: Option, + + /// A value to specify an offset to start retrieving rows. + pub offset: Option, + + /// A value to restrict the number of series returned. + pub series_limit: Option, + + /// A value to specify an offset to start retrieving series. + pub series_offset: Option, + + /// The timezone for the query, specified as [`tz('
{
        // write-through: create in the backing catalog; table snapshots are built on demand
        self.backing
            .repositories()
            .tables()
            .create(name, partition_template, namespace_id)
            .await
    }

    async fn get_by_id(&mut self, table_id: TableId) -> Result<Option<Table>> {
        self.backing
            .repositories()
            .tables()
            .get_by_id(table_id)
            .await
    }

    async fn get_by_namespace_and_name(
        &mut self,
        namespace_id: NamespaceId,
        name: &str,
    ) -> Result<Option<Table>> {
        self.backing
            .repositories()
            .tables()
            .get_by_namespace_and_name(namespace_id, name)
            .await
    }

    async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result<Vec<Table>> {
        self.backing
            .repositories()
            .tables()
            .list_by_namespace_id(namespace_id)
            .await
    }

    async fn list(&mut self) -> Result<Vec<Table>> {
        self.backing.repositories().tables().list().await
    }

    async fn snapshot(&mut self, table_id: TableId) -> Result<TableSnapshot> {
        self.backing
            .repositories()
            .tables()
            .snapshot(table_id)
            .await
    }
}

#[async_trait]
impl ColumnRepo for Repos {
    async fn create_or_get(
        &mut self,
        name: &str,
        table_id: TableId,
        column_type: ColumnType,
    ) -> Result<Column> {
        self.backing
            .repositories()
            .columns()
            .create_or_get(name, table_id, column_type)
            .await
    }

    async fn create_or_get_many_unchecked(
        &mut self,
        table_id: TableId,
        columns: HashMap<&str, ColumnType>,
    ) -> Result<Vec<Column>> {
        self.backing
            .repositories()
            .columns()
            .create_or_get_many_unchecked(table_id, columns)
            .await
    }

    async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result<Vec<Column>> {
        self.backing
            .repositories()
            .columns()
            .list_by_namespace_id(namespace_id)
            .await
    }

    async fn list_by_table_id(&mut self, table_id: TableId) -> Result<Vec<Column>> {
        self.backing
            .repositories()
            .columns()
            .list_by_table_id(table_id)
            .await
    }

    async fn list(&mut self) -> Result<Vec<Column>> {
        self.backing.repositories().columns().list().await
    }
}

#[async_trait]
impl PartitionRepo for Repos {
    async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result<Partition> {
        // read-through: need to wire up table snapshots to look this up efficiently
        self.backing
            .repositories()
            .partitions()
            .create_or_get(key, table_id)
            .await
    }

    async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result<Vec<Partition>> {
        futures::stream::iter(prepare_set(partition_ids.iter().cloned()))
            .map(|p_id| {
                let this = &self;
                async move {
                    let snapshot = match this.get_partition(p_id).await {
                        Ok(s) => s,
                        Err(Error::NotFound { .. }) => {
                            // unknown partitions are silently dropped from the result set
                            return Ok(futures::stream::empty().boxed());
                        }
                        Err(e) => {
                            return Err(e);
                        }
                    };

                    match snapshot.partition() {
                        Ok(p) => Ok(futures::stream::once(async move { Ok(p) }).boxed()),
                        Err(e) => Err(Error::from(e)),
                    }
                }
            })
            .buffer_unordered(self.quorum_fanout)
            .try_flatten()
            .try_collect::<Vec<_>>()
            .await
    }

    async fn list_by_table_id(&mut self, table_id: TableId) -> Result<Vec<Partition>> {
        // read-through: need to wire up table snapshots to look this up efficiently
        self.backing
            .repositories()
            .partitions()
            .list_by_table_id(table_id)
            .await
    }

    async fn list_ids(&mut self) -> Result<Vec<PartitionId>> {
        // read-through: only used for testing, we should eventually remove this interface
        self.backing.repositories().partitions().list_ids().await
    }

    async fn cas_sort_key(
        &mut self,
        partition_id: PartitionId,
        old_sort_key_ids: Option<&SortKeyIds>,
        new_sort_key_ids: &SortKeyIds,
    ) -> Result<Partition, CasFailure<SortKeyIds>> {
        let res = self
            .backing
            .repositories()
            .partitions()
            .cas_sort_key(partition_id, old_sort_key_ids, new_sort_key_ids)
            .await?;

        // write-through: refresh the cached snapshot so readers observe the new sort key
        self.refresh_partition(partition_id)
            .await
            .map_err(CasFailure::QueryError)?;

        Ok(res)
    }

    #[allow(clippy::too_many_arguments)]
    async fn record_skipped_compaction(
        &mut self,
        partition_id: PartitionId,
        reason: &str,
        num_files: usize,
        limit_num_files: usize,
        limit_num_files_first_in_partition: usize,
        estimated_bytes: u64,
        limit_bytes: u64,
    ) -> Result<()> {
        self.backing
            .repositories()
            .partitions()
            .record_skipped_compaction(
                partition_id,
                reason,
                num_files,
                limit_num_files,
                limit_num_files_first_in_partition,
                estimated_bytes,
                limit_bytes,
            )
            .await?;

        self.refresh_partition(partition_id).await?;

        Ok(())
    }

    async fn get_in_skipped_compactions(
        &mut self,
        partition_id: &[PartitionId],
    ) -> Result<Vec<SkippedCompaction>> {
        futures::stream::iter(prepare_set(partition_id.iter().cloned()))
            .map(|p_id| {
                let this = &self;
                async move {
                    let snapshot = match this.get_partition(p_id).await {
                        Ok(s) => s,
                        Err(Error::NotFound { .. }) => {
                            return Ok(futures::stream::empty().boxed());
                        }
                        Err(e) => {
                            return Err(e);
                        }
                    };

                    match snapshot.skipped_compaction() {
                        Some(sc) => Ok(futures::stream::once(async move { Ok(sc) }).boxed()),
                        None => Ok(futures::stream::empty().boxed()),
                    }
                }
            })
            .buffer_unordered(self.quorum_fanout)
            .try_flatten()
            .try_collect::<Vec<_>>()
            .await
    }

    async fn list_skipped_compactions(&mut self) -> Result<Vec<SkippedCompaction>> {
        // read-through: used for debugging, this should be replaced w/ proper hierarchy-traversal
        self.backing
            .repositories()
            .partitions()
            .list_skipped_compactions()
            .await
    }

    async fn delete_skipped_compactions(
        &mut self,
        partition_id: PartitionId,
    ) -> Result<Option<SkippedCompaction>> {
        let res = self
            .backing
            .repositories()
            .partitions()
            .delete_skipped_compactions(partition_id)
            .await?;

        self.refresh_partition(partition_id).await?;

        Ok(res)
    }

    async fn most_recent_n(&mut self, n: usize) -> Result<Vec<Partition>> {
        // read-through: used for ingester warm-up at the moment
        self.backing
            .repositories()
            .partitions()
            .most_recent_n(n)
            .await
    }

    async fn partitions_new_file_between(
        &mut self,
        minimum_time: Timestamp,
        maximum_time: Option<Timestamp>,
    ) -> Result<Vec<PartitionId>> {
        // read-through: used by the compactor for scheduling, we should eventually find a better interface
        self.backing
            .repositories()
            .partitions()
            .partitions_new_file_between(minimum_time, maximum_time)
            .await
    }
async fn list_old_style(&mut self) -> Result> { + // read-through: used by the ingester due to hash-id stuff + self.backing + .repositories() + .partitions() + .list_old_style() + .await + } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + self.get_partition(partition_id).await + } +} + +#[async_trait] +impl ParquetFileRepo for Repos { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let res = self + .backing + .repositories() + .parquet_files() + .flag_for_delete_by_retention() + .await?; + + let affected_partitions = res + .iter() + .map(|(p_id, _os_id)| *p_id) + .collect::>(); + + // ensure deterministic order + let mut affected_partitions = affected_partitions.into_iter().collect::>(); + affected_partitions.sort_unstable(); + + // refresh ALL partitons that are affected, NOT just only the ones that were cached. This should avoid the + // following "lost update" race condition: + // + // This scenario assumes that the partition in question is NOT cached yet. + // + // | T | Thread 1 | Thread 2 | + // | - | ------------------------------------- | -------------------------------------------------- | + // | 1 | receive `create_update_delete` | | + // | 2 | execute change within backing catalog | | + // | 3 | takes snapshot from backing catalog | | + // | 4 | | receive `flag_for_delete_by_retention` | + // | 5 | | execute change within backing catalog | + // | 6 | | affected partition not cached => no snapshot taken | + // | 7 | | return | + // | 8 | quorum-write snapshot | | + // | 9 | return | | + // + // The partition is now cached by does NOT contain the `flag_for_delete_by_retention` change and will not + // automatically converge. 
+ futures::stream::iter(affected_partitions) + .map(|p_id| { + let this = &self; + async move { + this.refresh_partition(p_id).await?; + Ok::<(), Error>(()) + } + }) + .buffer_unordered(self.quorum_fanout) + .try_collect::<()>() + .await?; + + Ok(res) + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + // deleted files are NOT part of the snapshot, so this bypasses the cache + self.backing + .repositories() + .parquet_files() + .delete_old_ids_only(older_than) + .await + } + + async fn list_by_partition_not_to_delete_batch( + &mut self, + partition_ids: Vec, + ) -> Result> { + futures::stream::iter(prepare_set(partition_ids)) + .map(|p_id| { + let this = &self; + async move { + let snapshot = match this.get_partition(p_id).await { + Ok(s) => s, + Err(Error::NotFound { .. }) => { + return Ok(futures::stream::empty().boxed()); + } + Err(e) => { + return Err(e); + } + }; + + // Decode files so we can drop the snapshot early. + // + // Need to collect the file results into a vec though because we cannot return borrowed data and + // "owned iterators" aren't a thing. 
+ let files = snapshot + .files() + .map(|res| res.map_err(Error::from)) + .collect::>(); + Ok::<_, Error>(futures::stream::iter(files).boxed()) + } + }) + .buffer_unordered(self.quorum_fanout) + .try_flatten() + .try_collect::>() + .await + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result> { + // read-through: see https://github.com/influxdata/influxdb_iox/issues/9719 + self.backing + .repositories() + .parquet_files() + .get_by_object_store_id(object_store_id) + .await + } + + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result> { + // read-through: this is used by the GC, so this is not overall latency-critical + self.backing + .repositories() + .parquet_files() + .exists_by_object_store_id_batch(object_store_ids) + .await + } + + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result> { + let res = self + .backing + .repositories() + .parquet_files() + .create_upgrade_delete(partition_id, delete, upgrade, create, target_level) + .await?; + + self.refresh_partition(partition_id).await?; + + Ok(res) + } +} + +/// Prepare set of elements in deterministic order. 
+fn prepare_set(set: S) -> Vec +where + S: IntoIterator, + T: Eq + Ord, +{ + // ensure deterministic order (also required for de-dup) + let mut set = set.into_iter().collect::>(); + set.sort_unstable(); + + // de-dup + set.dedup(); + + set +} + +#[cfg(test)] +mod tests { + use catalog_cache::api::server::test_util::TestCacheServer; + use catalog_cache::local::CatalogCache; + use iox_time::SystemProvider; + + use crate::{interface_tests::TestCatalog, mem::MemCatalog}; + + use super::*; + use std::sync::Arc; + + #[tokio::test] + async fn test_catalog() { + crate::interface_tests::test_catalog(|| async { + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(SystemProvider::new()) as _; + let backing = Arc::new(MemCatalog::new(metrics, Arc::clone(&time_provider))); + + let peer0 = TestCacheServer::bind_ephemeral(); + let peer1 = TestCacheServer::bind_ephemeral(); + let cache = Arc::new(QuorumCatalogCache::new( + Arc::new(CatalogCache::default()), + Arc::new([peer0.client(), peer1.client()]), + )); + + // use new metrics registry so the two layers don't double-count + let metrics = Arc::new(metric::Registry::default()); + let caching_catalog = Arc::new(CachingCatalog::new( + cache, + backing, + metrics, + time_provider, + 10, + )); + + let test_catalog = TestCatalog::new(caching_catalog); + test_catalog.hold_onto(peer0); + test_catalog.hold_onto(peer1); + + Arc::new(test_catalog) as _ + }) + .await; + } +} diff --git a/iox_catalog/src/constants.rs b/iox_catalog/src/constants.rs new file mode 100644 index 0000000..b6b88fb --- /dev/null +++ b/iox_catalog/src/constants.rs @@ -0,0 +1,19 @@ +//! Constants that are hold for all catalog implementations. + +/// Time column. +pub const TIME_COLUMN: &str = "time"; + +/// Default retention period for data in the catalog. +pub const DEFAULT_RETENTION_PERIOD: Option = None; + +/// Maximum number of files touched by [`ParquetFileRepo::flag_for_delete_by_retention`] at a time. 
+/// +/// +/// [`ParquetFileRepo::flag_for_delete_by_retention`]: crate::interface::ParquetFileRepo::flag_for_delete_by_retention +pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION: i64 = 1_000; + +/// Maximum number of files touched by [`ParquetFileRepo::delete_old_ids_only`] at a time. +/// +/// +/// [`ParquetFileRepo::delete_old_ids_only`]: crate::interface::ParquetFileRepo::delete_old_ids_only +pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE: i64 = 10_000; diff --git a/iox_catalog/src/grpc/client.rs b/iox_catalog/src/grpc/client.rs new file mode 100644 index 0000000..8edc05d --- /dev/null +++ b/iox_catalog/src/grpc/client.rs @@ -0,0 +1,997 @@ +//! gRPC client implementation. +use std::future::Future; +use std::ops::ControlFlow; +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use futures::TryStreamExt; +use log::{debug, info, warn}; +use tonic::transport::{Channel, Uri}; + +use crate::{ + interface::{ + CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, PartitionRepo, + RepoCollection, Result, SoftDeletedRows, TableRepo, + }, + metrics::MetricDecorator, +}; +use backoff::{Backoff, BackoffError}; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + snapshot::table::TableSnapshot, + Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionKey, SkippedCompaction, + SortKeyIds, Table, TableId, Timestamp, +}; +use generated_types::influxdata::iox::catalog::v2 as proto; +use iox_time::TimeProvider; +use trace_http::metrics::{MetricFamily, RequestMetrics}; +use trace_http::tower::TraceService; + +use super::serialization::{ + convert_status, deserialize_column, deserialize_namespace, 
deserialize_object_store_id, + deserialize_parquet_file, deserialize_partition, deserialize_skipped_compaction, + deserialize_sort_key_ids, deserialize_table, serialize_column_type, serialize_object_store_id, + serialize_parquet_file_params, serialize_soft_deleted_rows, serialize_sort_key_ids, ContextExt, + RequiredExt, +}; + +type InstrumentedChannel = TraceService; + +/// Catalog that goes through a gRPC interface. +#[derive(Debug)] +pub struct GrpcCatalogClient { + channel: InstrumentedChannel, + metrics: Arc, + time_provider: Arc, +} + +impl GrpcCatalogClient { + /// Create new client. + pub fn new( + uri: Uri, + metrics: Arc, + time_provider: Arc, + ) -> Self { + let channel = TraceService::new_client( + Channel::builder(uri).connect_lazy(), + Arc::new(RequestMetrics::new( + Arc::clone(&metrics), + MetricFamily::GrpcClient, + )), + None, + "catalog", + ); + Self { + channel, + metrics, + time_provider, + } + } +} + +impl std::fmt::Display for GrpcCatalogClient { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "grpc") + } +} + +#[async_trait] +impl Catalog for GrpcCatalogClient { + async fn setup(&self) -> Result<(), Error> { + Ok(()) + } + + fn repositories(&self) -> Box { + Box::new(MetricDecorator::new( + GrpcCatalogClientRepos { + channel: self.channel.clone(), + }, + Arc::clone(&self.metrics), + Arc::clone(&self.time_provider), + )) + } + + #[cfg(test)] + fn metrics(&self) -> Arc { + Arc::clone(&self.metrics) + } + + fn time_provider(&self) -> Arc { + Arc::clone(&self.time_provider) + } +} + +#[derive(Debug)] +struct GrpcCatalogClientRepos { + channel: InstrumentedChannel, +} + +type ServiceClient = proto::catalog_service_client::CatalogServiceClient; + +fn is_upstream_error(e: &tonic::Status) -> bool { + matches!( + e.code(), + tonic::Code::Cancelled + | tonic::Code::DeadlineExceeded + | tonic::Code::FailedPrecondition + | tonic::Code::Aborted + | tonic::Code::Unavailable + ) +} + +impl GrpcCatalogClientRepos { + fn 
client(&self) -> ServiceClient { + proto::catalog_service_client::CatalogServiceClient::new(self.channel.clone()) + } + + async fn retry( + &self, + operation: &str, + upload: U, + fun_io: FunIo, + ) -> Result + where + U: Clone + std::fmt::Debug + Send + Sync, + FunIo: Fn(U, ServiceClient) -> Fut + Send + Sync, + Fut: Future, tonic::Status>> + Send, + D: std::fmt::Debug, + { + Backoff::new(&Default::default()) + .retry_with_backoff(operation, || async { + let res = fun_io(upload.clone(), self.client()).await; + match res { + Ok(r) => { + let r = r.into_inner(); + debug!("{} successfully received: {:?}", operation, &r); + ControlFlow::Break(Ok(r)) + } + Err(e) if is_upstream_error(&e) => { + info!("{} retriable error encountered: {:?}", operation, &e); + ControlFlow::Continue(e) + } + Err(e) => { + warn!( + "{operation} attempted {:?} and received error: {:?}", + upload, e + ); + ControlFlow::Break(Err(convert_status(e))) + } + } + }) + .await + .map_err(|be| { + let status = match be { + BackoffError::DeadlineExceeded { source, .. } => source, + }; + convert_status(status) + })? 
+ } +} + +impl RepoCollection for GrpcCatalogClientRepos { + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + } +} + +#[async_trait] +impl NamespaceRepo for GrpcCatalogClientRepos { + async fn create( + &mut self, + name: &NamespaceName<'_>, + partition_template: Option, + retention_period_ns: Option, + service_protection_limits: Option, + ) -> Result { + let n = proto::NamespaceCreateRequest { + name: name.to_string(), + partition_template: partition_template.and_then(|t| t.as_proto().cloned()), + retention_period_ns, + service_protection_limits: service_protection_limits.map(|l| { + proto::ServiceProtectionLimits { + max_tables: l.max_tables.map(|x| x.get_i32()), + max_columns_per_table: l.max_columns_per_table.map(|x| x.get_i32()), + } + }), + }; + + let resp = self + .retry("namespace_create", n, |data, mut client| async move { + client.namespace_create(data).await + }) + .await?; + + Ok(deserialize_namespace( + resp.namespace.required().ctx("namespace")?, + )?) + } + + async fn update_retention_period( + &mut self, + name: &str, + retention_period_ns: Option, + ) -> Result { + let n = proto::NamespaceUpdateRetentionPeriodRequest { + name: name.to_owned(), + retention_period_ns, + }; + + let resp = self.retry( + "namespace_update_retention_period", + n, + |data, mut client| async move { client.namespace_update_retention_period(data).await }, + ) + .await?; + + Ok(deserialize_namespace( + resp.namespace.required().ctx("namespace")?, + )?) 
+ } + + async fn list(&mut self, deleted: SoftDeletedRows) -> Result> { + let n = proto::NamespaceListRequest { + deleted: serialize_soft_deleted_rows(deleted), + }; + + self.retry("namespace_list", n, |data, mut client| async move { + client.namespace_list(data).await + }) + .await? + .map_err(convert_status) + .and_then(|res| async move { + deserialize_namespace(res.namespace.required().ctx("namespace")?).map_err(Error::from) + }) + .try_collect() + .await + } + + async fn get_by_id( + &mut self, + id: NamespaceId, + deleted: SoftDeletedRows, + ) -> Result> { + let n = proto::NamespaceGetByIdRequest { + id: id.get(), + deleted: serialize_soft_deleted_rows(deleted), + }; + + let resp = self + .retry("namespace_get_by_id", n, |data, mut client| async move { + client.namespace_get_by_id(data).await + }) + .await?; + Ok(resp.namespace.map(deserialize_namespace).transpose()?) + } + + async fn get_by_name( + &mut self, + name: &str, + deleted: SoftDeletedRows, + ) -> Result> { + let n = proto::NamespaceGetByNameRequest { + name: name.to_owned(), + deleted: serialize_soft_deleted_rows(deleted), + }; + + let resp = self + .retry("namespace_get_by_name", n, |data, mut client| async move { + client.namespace_get_by_name(data).await + }) + .await?; + Ok(resp.namespace.map(deserialize_namespace).transpose()?) 
+ } + + async fn soft_delete(&mut self, name: &str) -> Result<()> { + let n = proto::NamespaceSoftDeleteRequest { + name: name.to_owned(), + }; + + self.retry("namespace_soft_delete", n, |data, mut client| async move { + client.namespace_soft_delete(data).await + }) + .await?; + Ok(()) + } + + async fn update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result { + let n = proto::NamespaceUpdateTableLimitRequest { + name: name.to_owned(), + new_max: new_max.get_i32(), + }; + + let resp = self + .retry("namespace_soft_delete", n, |data, mut client| async move { + client.namespace_update_table_limit(data).await + }) + .await?; + + Ok(deserialize_namespace( + resp.namespace.required().ctx("namespace")?, + )?) + } + + async fn update_column_limit( + &mut self, + name: &str, + new_max: MaxColumnsPerTable, + ) -> Result { + let n = proto::NamespaceUpdateColumnLimitRequest { + name: name.to_owned(), + new_max: new_max.get_i32(), + }; + + let resp = self + .retry("namespace_soft_delete", n, |data, mut client| async move { + client.namespace_update_column_limit(data).await + }) + .await?; + + Ok(deserialize_namespace( + resp.namespace.required().ctx("namespace")?, + )?) + } +} + +#[async_trait] +impl TableRepo for GrpcCatalogClientRepos { + async fn create( + &mut self, + name: &str, + partition_template: TablePartitionTemplateOverride, + namespace_id: NamespaceId, + ) -> Result
{ + let t = proto::TableCreateRequest { + name: name.to_owned(), + partition_template: partition_template.as_proto().cloned(), + namespace_id: namespace_id.get(), + }; + + let resp = self + .retry("table_create", t, |data, mut client| async move { + client.table_create(data).await + }) + .await?; + Ok(deserialize_table(resp.table.required().ctx("table")?)?) + } + + async fn get_by_id(&mut self, table_id: TableId) -> Result> { + let t = proto::TableGetByIdRequest { id: table_id.get() }; + + let resp = self + .retry("table_get_by_id", t, |data, mut client| async move { + client.table_get_by_id(data).await + }) + .await?; + Ok(resp.table.map(deserialize_table).transpose()?) + } + + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result> { + let t = proto::TableGetByNamespaceAndNameRequest { + namespace_id: namespace_id.get(), + name: name.to_owned(), + }; + + let resp = self.retry( + "table_get_by_namespace_and_name", + t, + |data, mut client| async move { client.table_get_by_namespace_and_name(data).await }, + ) + .await?; + Ok(resp.table.map(deserialize_table).transpose()?) + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let t = proto::TableListByNamespaceIdRequest { + namespace_id: namespace_id.get(), + }; + + self.retry( + "table_list_by_namespace_id", + t, + |data, mut client| async move { client.table_list_by_namespace_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { Ok(deserialize_table(res.table.required().ctx("table")?)?) }) + .try_collect() + .await + } + + async fn list(&mut self) -> Result> { + let t = proto::TableListRequest {}; + + self.retry("table_list", t, |data, mut client| async move { + client.table_list(data).await + }) + .await? + .map_err(convert_status) + .and_then(|res| async move { Ok(deserialize_table(res.table.required().ctx("table")?)?) 
}) + .try_collect() + .await + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let t = proto::TableSnapshotRequest { + table_id: table_id.get(), + }; + + let resp = self + .retry("table_snapshot", t, |data, mut client| async move { + client.table_snapshot(data).await + }) + .await?; + + let table = resp.table.required().ctx("table")?; + Ok(TableSnapshot::decode(table, resp.generation)) + } +} + +#[async_trait] +impl ColumnRepo for GrpcCatalogClientRepos { + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result { + let c = proto::ColumnCreateOrGetRequest { + name: name.to_owned(), + table_id: table_id.get(), + column_type: serialize_column_type(column_type), + }; + + let resp = self + .retry("column_create_or_get", c, |data, mut client| async move { + client.column_create_or_get(data).await + }) + .await?; + Ok(deserialize_column(resp.column.required().ctx("column")?)?) + } + + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result> { + let c = proto::ColumnCreateOrGetManyUncheckedRequest { + table_id: table_id.get(), + columns: columns + .into_iter() + .map(|(name, t)| (name.to_owned(), serialize_column_type(t))) + .collect(), + }; + + self.retry( + "column_create_or_get_many_unchecked", + c, + |data, mut client| async move { client.column_create_or_get_many_unchecked(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_column(res.column.required().ctx("column")?)?) + }) + .try_collect() + .await + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let c = proto::ColumnListByNamespaceIdRequest { + namespace_id: namespace_id.get(), + }; + + self.retry( + "column_list_by_namespace_id", + c, + |data, mut client| async move { client.column_list_by_namespace_id(data).await }, + ) + .await? 
+ .map_err(convert_status) + .and_then( + |res| async move { Ok(deserialize_column(res.column.required().ctx("column")?)?) }, + ) + .try_collect() + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let c = proto::ColumnListByTableIdRequest { + table_id: table_id.get(), + }; + + self.retry( + "column_list_by_table_id", + c, + |data, mut client| async move { client.column_list_by_table_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then( + |res| async move { Ok(deserialize_column(res.column.required().ctx("column")?)?) }, + ) + .try_collect() + .await + } + + async fn list(&mut self) -> Result> { + let c = proto::ColumnListRequest {}; + + self.retry("column_list", c, |data, mut client| async move { + client.column_list(data).await + }) + .await? + .map_err(convert_status) + .and_then( + |res| async move { Ok(deserialize_column(res.column.required().ctx("column")?)?) }, + ) + .try_collect() + .await + } +} + +#[async_trait] +impl PartitionRepo for GrpcCatalogClientRepos { + async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { + let p = proto::PartitionCreateOrGetRequest { + key: key.inner().to_owned(), + table_id: table_id.get(), + }; + + let resp = self + .retry( + "partition_create_or_get", + p, + |data, mut client| async move { client.partition_create_or_get(data).await }, + ) + .await?; + + Ok(deserialize_partition( + resp.partition.required().ctx("partition")?, + )?) + } + + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + let p = proto::PartitionGetByIdBatchRequest { + partition_ids: partition_ids.iter().map(|id| id.get()).collect(), + }; + + self.retry( + "partition_get_by_id_batch", + p, + |data, mut client| async move { client.partition_get_by_id_batch(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) 
+ }) + .try_collect() + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let p = proto::PartitionListByTableIdRequest { + table_id: table_id.get(), + }; + + self.retry( + "partition_list_by_table_id", + p, + |data, mut client| async move { client.partition_list_by_table_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn list_ids(&mut self) -> Result> { + let p = proto::PartitionListIdsRequest {}; + + self.retry("partition_list_ids", p, |data, mut client| async move { + client.partition_list_ids(data).await + }) + .await? + .map_err(convert_status) + .map_ok(|res| PartitionId::new(res.partition_id)) + .try_collect() + .await + } + + async fn cas_sort_key( + &mut self, + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + // This method does not use request/request_streaming_response + // because the error handling (converting to CasFailure) differs + // from how all the other methods handle errors. 
+ + let p = proto::PartitionCasSortKeyRequest { + partition_id: partition_id.get(), + old_sort_key_ids: old_sort_key_ids.map(serialize_sort_key_ids), + new_sort_key_ids: Some(serialize_sort_key_ids(new_sort_key_ids)), + }; + + let res = self + .retry("partition_cas_sort_key", p, |data, mut client| async move { + client.partition_cas_sort_key(data).await + }) + .await + .map_err(CasFailure::QueryError)?; + + let res = res + .res + .required() + .ctx("res") + .map_err(|e| CasFailure::QueryError(e.into()))?; + + match res { + proto::partition_cas_sort_key_response::Res::Partition(p) => { + let p = deserialize_partition(p).map_err(|e| CasFailure::QueryError(e.into()))?; + Ok(p) + } + proto::partition_cas_sort_key_response::Res::CurrentSortKey(k) => { + Err(CasFailure::ValueMismatch(deserialize_sort_key_ids(k))) + } + } + } + + #[allow(clippy::too_many_arguments)] + async fn record_skipped_compaction( + &mut self, + partition_id: PartitionId, + reason: &str, + num_files: usize, + limit_num_files: usize, + limit_num_files_first_in_partition: usize, + estimated_bytes: u64, + limit_bytes: u64, + ) -> Result<()> { + let p = proto::PartitionRecordSkippedCompactionRequest { + partition_id: partition_id.get(), + reason: reason.to_owned(), + num_files: num_files as u64, + limit_num_files: limit_num_files as u64, + limit_num_files_first_in_partition: limit_num_files_first_in_partition as u64, + estimated_bytes, + limit_bytes, + }; + + self.retry( + "partition_record_skipped_compaction", + p, + |data, mut client| async move { client.partition_record_skipped_compaction(data).await }, + ) + .await?; + Ok(()) + } + + async fn get_in_skipped_compactions( + &mut self, + partition_id: &[PartitionId], + ) -> Result> { + let p = proto::PartitionGetInSkippedCompactionsRequest { + partition_ids: partition_id.iter().map(|id| id.get()).collect(), + }; + + self.retry( + "partition_get_in_skipped_compactions", + p, + |data, mut client| async move { 
client.partition_get_in_skipped_compactions(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_skipped_compaction(res.skipped_compaction.required().ctx("skipped_compaction")?)) + }) + .try_collect() + .await + } + + async fn list_skipped_compactions(&mut self) -> Result> { + let p = proto::PartitionListSkippedCompactionsRequest {}; + + self.retry( + "partition_list_skipped_compactions", + p, + |data, mut client| async move { client.partition_list_skipped_compactions(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_skipped_compaction( + res.skipped_compaction + .required() + .ctx("skipped_compaction")?, + )) + }) + .try_collect() + .await + } + + async fn delete_skipped_compactions( + &mut self, + partition_id: PartitionId, + ) -> Result> { + let p = proto::PartitionDeleteSkippedCompactionsRequest { + partition_id: partition_id.get(), + }; + + let resp = self + .retry( + "partition_delete_skipped_compactions", + p, + |data, mut client| async move { + client.partition_delete_skipped_compactions(data).await + }, + ) + .await?; + + Ok(resp.skipped_compaction.map(deserialize_skipped_compaction)) + } + + async fn most_recent_n(&mut self, n: usize) -> Result> { + let p = proto::PartitionMostRecentNRequest { n: n as u64 }; + + self.retry( + "partition_most_recent_n", + p, + |data, mut client| async move { client.partition_most_recent_n(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) 
+ }) + .try_collect() + .await + } + + async fn partitions_new_file_between( + &mut self, + minimum_time: Timestamp, + maximum_time: Option, + ) -> Result> { + let p = proto::PartitionNewFileBetweenRequest { + minimum_time: minimum_time.get(), + maximum_time: maximum_time.map(|ts| ts.get()), + }; + + self.retry( + "partition_new_file_between", + p, + |data, mut client| async move { client.partition_new_file_between(data).await }, + ) + .await? + .map_err(convert_status) + .map_ok(|res| PartitionId::new(res.partition_id)) + .try_collect() + .await + } + + async fn list_old_style(&mut self) -> Result> { + let p = proto::PartitionListOldStyleRequest {}; + + self.retry( + "partition_list_old_style", + p, + |data, mut client| async move { client.partition_list_old_style(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let p = proto::PartitionSnapshotRequest { + partition_id: partition_id.get(), + }; + + let resp = self + .retry("partition_snapshot", p, |data, mut client| async move { + client.partition_snapshot(data).await + }) + .await?; + let partition = resp.partition.required().ctx("partition")?; + Ok(PartitionSnapshot::decode(partition, resp.generation)) + } +} + +#[async_trait] +impl ParquetFileRepo for GrpcCatalogClientRepos { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let p = proto::ParquetFileFlagForDeleteByRetentionRequest {}; + + self.retry( + "parquet_file_flag_for_delete_by_retention", + p, + |data, mut client| async move { + client.parquet_file_flag_for_delete_by_retention(data).await + }, + ) + .await? 
+ .map_err(convert_status) + .and_then(|res| async move { + Ok(( + PartitionId::new(res.partition_id), + deserialize_object_store_id(res.object_store_id.required().ctx("object_store_id")?), + )) + }) + .try_collect() + .await + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + let p = proto::ParquetFileDeleteOldIdsOnlyRequest { + older_than: older_than.get(), + }; + + self.retry( + "parquet_file_delete_old_ids_only", + p, + |data, mut client| async move { client.parquet_file_delete_old_ids_only(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_object_store_id( + res.object_store_id.required().ctx("object_store_id")?, + )) + }) + .try_collect() + .await + } + + async fn list_by_partition_not_to_delete_batch( + &mut self, + partition_ids: Vec, + ) -> Result> { + let p = proto::ParquetFileListByPartitionNotToDeleteBatchRequest { + partition_ids: partition_ids.into_iter().map(|p| p.get()).collect(), + }; + + self.retry( + "parquet_file_list_by_partition_not_to_delete_batch", + p, + |data, mut client| async move { + client + .parquet_file_list_by_partition_not_to_delete_batch(data) + .await + }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_parquet_file( + res.parquet_file.required().ctx("parquet_file")?, + )?) + }) + .try_collect() + .await + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result> { + let p = proto::ParquetFileGetByObjectStoreIdRequest { + object_store_id: Some(serialize_object_store_id(object_store_id)), + }; + + let maybe_file = self.retry( + "parquet_file_get_by_object_store_id", + p, + |data, mut client| async move { client.parquet_file_get_by_object_store_id(data).await }) + .await? 
+ .parquet_file.map(deserialize_parquet_file).transpose()?; + Ok(maybe_file) + } + + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result> { + let p = futures::stream::iter(object_store_ids.into_iter().map(|id| { + proto::ParquetFileExistsByObjectStoreIdBatchRequest { + object_store_id: Some(serialize_object_store_id(id)), + } + })); + + self.retry( + "parquet_file_exists_by_object_store_id_batch", + p, + |data, mut client: ServiceClient| async move { + client + .parquet_file_exists_by_object_store_id_batch(data) + .await + }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_object_store_id( + res.object_store_id.required().ctx("object_store_id")?, + )) + }) + .try_collect() + .await + } + + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result> { + let p = proto::ParquetFileCreateUpgradeDeleteRequest { + partition_id: partition_id.get(), + delete: delete + .iter() + .copied() + .map(serialize_object_store_id) + .collect(), + upgrade: upgrade + .iter() + .copied() + .map(serialize_object_store_id) + .collect(), + create: create.iter().map(serialize_parquet_file_params).collect(), + target_level: target_level as i32, + }; + + let resp = self.retry( + "parquet_file_create_upgrade_delete", + p, + |data, mut client| async move { client.parquet_file_create_upgrade_delete(data).await }, + ) + .await?; + + Ok(resp + .created_parquet_file_ids + .into_iter() + .map(ParquetFileId::new) + .collect()) + } +} diff --git a/iox_catalog/src/grpc/mod.rs b/iox_catalog/src/grpc/mod.rs new file mode 100644 index 0000000..0374f57 --- /dev/null +++ b/iox_catalog/src/grpc/mod.rs @@ -0,0 +1,143 @@ +//! gRPC catalog tunnel. +//! +//! This tunnels catalog requests over gRPC. 
+ +pub mod client; +mod serialization; +pub mod server; + +#[cfg(test)] +mod tests { + use std::{net::SocketAddr, sync::Arc}; + + use data_types::NamespaceName; + use iox_time::SystemProvider; + use metric::{Attributes, Metric, U64Counter}; + use test_helpers::maybe_start_logging; + use tokio::{net::TcpListener, task::JoinSet}; + use tonic::transport::{server::TcpIncoming, Server, Uri}; + + use crate::{interface::Catalog, interface_tests::TestCatalog, mem::MemCatalog}; + + use super::*; + + #[tokio::test] + async fn test_catalog() { + maybe_start_logging(); + + crate::interface_tests::test_catalog(|| async { + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(SystemProvider::new()) as _; + let backing_catalog = Arc::new(MemCatalog::new(metrics, Arc::clone(&time_provider))); + let test_server = TestServer::new(backing_catalog).await; + let uri = test_server.uri(); + + // create new metrics for client so that they don't overlap w/ server + let metrics = Arc::new(metric::Registry::default()); + let client = Arc::new(client::GrpcCatalogClient::new( + uri, + metrics, + Arc::clone(&time_provider), + )); + + let test_catalog = TestCatalog::new(client); + test_catalog.hold_onto(test_server); + + Arc::new(test_catalog) as _ + }) + .await; + } + + #[tokio::test] + async fn test_catalog_metrics() { + maybe_start_logging(); + + let time_provider = Arc::new(SystemProvider::new()) as _; + let metrics = Arc::new(metric::Registry::default()); + let backing_catalog = Arc::new(MemCatalog::new(metrics, Arc::clone(&time_provider))); + let test_server = TestServer::new(backing_catalog).await; + let uri = test_server.uri(); + + // create new metrics for client so that they don't overlap w/ server + let metrics = Arc::new(metric::Registry::default()); + let client = Arc::new(client::GrpcCatalogClient::new( + uri, + Arc::clone(&metrics), + Arc::clone(&time_provider), + )); + + let ns = client + .repositories() + .namespaces() + 
.create(&NamespaceName::new("testns").unwrap(), None, None, None) + .await + .expect("namespace failed to create"); + + let _ = client + .repositories() + .tables() + .list_by_namespace_id(ns.id) + .await + .expect("failed to list namespaces"); + + let metric = metrics + .get_instrument::>("grpc_client_requests") + .expect("failed to get metric"); + + let count = metric + .get_observer(&Attributes::from(&[ + ( + "path", + "/influxdata.iox.catalog.v2.CatalogService/NamespaceCreate", + ), + ("status", "ok"), + ])) + .unwrap() + .fetch(); + + assert_eq!(count, 1); + + let count = metric + .get_observer(&Attributes::from(&[ + ( + "path", + "/influxdata.iox.catalog.v2.CatalogService/TableListByNamespaceId", + ), + ("status", "ok"), + ])) + .unwrap() + .fetch(); + + assert_eq!(count, 1); + } + + struct TestServer { + addr: SocketAddr, + #[allow(dead_code)] + task: JoinSet<()>, + } + + impl TestServer { + async fn new(catalog: Arc) -> Self { + let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let incoming = TcpIncoming::from_listener(listener, true, None).unwrap(); + let mut task = JoinSet::new(); + task.spawn(async move { + Server::builder() + .add_service(server::GrpcCatalogServer::new(catalog).service()) + .serve_with_incoming(incoming) + .await + .unwrap(); + }); + + Self { addr, task } + } + + fn uri(&self) -> Uri { + format!("http://{}:{}", self.addr.ip(), self.addr.port()) + .parse() + .unwrap() + } + } +} diff --git a/iox_catalog/src/grpc/serialization.rs b/iox_catalog/src/grpc/serialization.rs new file mode 100644 index 0000000..2698dc4 --- /dev/null +++ b/iox_catalog/src/grpc/serialization.rs @@ -0,0 +1,712 @@ +use data_types::{ + partition_template::NamespacePartitionTemplateOverride, Column, ColumnId, ColumnSet, + ColumnType, Namespace, NamespaceId, ObjectStoreId, ParquetFile, ParquetFileId, + ParquetFileParams, Partition, PartitionId, SkippedCompaction, SortKeyIds, Table, TableId, + Timestamp, +}; 
+use generated_types::influxdata::iox::catalog::v2 as proto; +use uuid::Uuid; + +use crate::interface::SoftDeletedRows; + +#[derive(Debug)] +pub struct Error { + msg: String, + path: Vec<&'static str>, +} + +impl Error { + fn new(e: E) -> Self + where + E: std::fmt::Display, + { + Self { + msg: e.to_string(), + path: vec![], + } + } + + fn ctx(self, arg: &'static str) -> Self { + let Self { msg, mut path } = self; + path.insert(0, arg); + Self { msg, path } + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if !self.path.is_empty() { + write!(f, "{}", self.path[0])?; + for p in self.path.iter().skip(1) { + write!(f, ".{}", p)?; + } + write!(f, ": ")?; + } + + write!(f, "{}", self.msg)?; + + Ok(()) + } +} + +impl std::error::Error for Error {} + +impl From for crate::interface::Error { + fn from(e: Error) -> Self { + Self::External { source: e.into() } + } +} + +impl From for tonic::Status { + fn from(e: Error) -> Self { + Self::invalid_argument(e.to_string()) + } +} + +pub(crate) trait ConvertExt { + fn convert(self) -> Result; +} + +impl ConvertExt for T +where + T: TryInto, + T::Error: std::fmt::Display, +{ + fn convert(self) -> Result { + self.try_into().map_err(Error::new) + } +} + +pub(crate) trait ConvertOptExt { + fn convert_opt(self) -> Result; +} + +impl ConvertOptExt> for Option +where + T: TryInto, + T::Error: std::fmt::Display, +{ + fn convert_opt(self) -> Result, Error> { + self.map(|x| x.convert()).transpose() + } +} + +pub(crate) trait RequiredExt { + fn required(self) -> Result; +} + +impl RequiredExt for Option { + fn required(self) -> Result { + self.ok_or_else(|| Error::new("required")) + } +} + +pub(crate) trait ContextExt { + fn ctx(self, path: &'static str) -> Result; +} + +impl ContextExt for Result { + fn ctx(self, path: &'static str) -> Self { + self.map_err(|e| e.ctx(path)) + } +} + +pub(crate) fn catalog_error_to_status(e: crate::interface::Error) -> tonic::Status { + use 
crate::interface::Error; + + match e { + Error::External { source } => tonic::Status::internal(source.to_string()), + Error::AlreadyExists { descr } => tonic::Status::already_exists(descr), + Error::LimitExceeded { descr } => tonic::Status::resource_exhausted(descr), + Error::NotFound { descr } => tonic::Status::not_found(descr), + } +} + +pub(crate) fn convert_status(status: tonic::Status) -> crate::interface::Error { + use crate::interface::Error; + + match status.code() { + tonic::Code::Internal => Error::External { + source: status.message().to_owned().into(), + }, + tonic::Code::AlreadyExists => Error::AlreadyExists { + descr: status.message().to_owned(), + }, + tonic::Code::ResourceExhausted => Error::LimitExceeded { + descr: status.message().to_owned(), + }, + tonic::Code::NotFound => Error::NotFound { + descr: status.message().to_owned(), + }, + _ => Error::External { + source: Box::new(status), + }, + } +} + +pub(crate) fn serialize_soft_deleted_rows(sdr: SoftDeletedRows) -> i32 { + let sdr = match sdr { + SoftDeletedRows::AllRows => proto::SoftDeletedRows::AllRows, + SoftDeletedRows::ExcludeDeleted => proto::SoftDeletedRows::ExcludeDeleted, + SoftDeletedRows::OnlyDeleted => proto::SoftDeletedRows::OnlyDeleted, + }; + + sdr.into() +} + +pub(crate) fn deserialize_soft_deleted_rows(sdr: i32) -> Result { + let sdr: proto::SoftDeletedRows = sdr.convert().ctx("soft deleted rows")?; + let sdr = match sdr { + proto::SoftDeletedRows::Unspecified => { + return Err(Error::new("unspecified soft deleted rows")); + } + proto::SoftDeletedRows::AllRows => SoftDeletedRows::AllRows, + proto::SoftDeletedRows::ExcludeDeleted => SoftDeletedRows::ExcludeDeleted, + proto::SoftDeletedRows::OnlyDeleted => SoftDeletedRows::OnlyDeleted, + }; + Ok(sdr) +} + +pub(crate) fn serialize_namespace(ns: Namespace) -> proto::Namespace { + proto::Namespace { + id: ns.id.get(), + name: ns.name, + retention_period_ns: ns.retention_period_ns, + max_tables: ns.max_tables.get_i32(), + 
max_columns_per_table: ns.max_columns_per_table.get_i32(), + deleted_at: ns.deleted_at.map(|ts| ts.get()), + partition_template: ns.partition_template.as_proto().cloned(), + } +} + +pub(crate) fn deserialize_namespace(ns: proto::Namespace) -> Result { + Ok(Namespace { + id: NamespaceId::new(ns.id), + name: ns.name, + retention_period_ns: ns.retention_period_ns, + max_tables: ns.max_tables.convert().ctx("max_tables")?, + max_columns_per_table: ns + .max_columns_per_table + .convert() + .ctx("max_columns_per_table")?, + deleted_at: ns.deleted_at.map(Timestamp::new), + partition_template: ns + .partition_template + .convert_opt() + .ctx("partition_template")? + .unwrap_or_else(NamespacePartitionTemplateOverride::const_default), + }) +} + +pub(crate) fn serialize_table(t: Table) -> proto::Table { + proto::Table { + id: t.id.get(), + namespace_id: t.namespace_id.get(), + name: t.name, + partition_template: t.partition_template.as_proto().cloned(), + } +} + +pub(crate) fn deserialize_table(t: proto::Table) -> Result { + Ok(Table { + id: TableId::new(t.id), + namespace_id: NamespaceId::new(t.namespace_id), + name: t.name, + partition_template: t.partition_template.convert().ctx("partition_template")?, + }) +} + +pub(crate) fn serialize_column_type(t: ColumnType) -> i32 { + use generated_types::influxdata::iox::column_type::v1 as proto; + proto::ColumnType::from(t).into() +} + +pub(crate) fn deserialize_column_type(t: i32) -> Result { + use generated_types::influxdata::iox::column_type::v1 as proto; + let t: proto::ColumnType = t.convert()?; + t.convert() +} + +pub(crate) fn serialize_column(column: Column) -> proto::Column { + proto::Column { + id: column.id.get(), + table_id: column.table_id.get(), + name: column.name, + column_type: serialize_column_type(column.column_type), + } +} + +pub(crate) fn deserialize_column(column: proto::Column) -> Result { + Ok(Column { + id: ColumnId::new(column.id), + table_id: TableId::new(column.table_id), + name: column.name, + 
column_type: deserialize_column_type(column.column_type)?, + }) +} + +pub(crate) fn serialize_sort_key_ids(sort_key_ids: &SortKeyIds) -> proto::SortKeyIds { + proto::SortKeyIds { + column_ids: sort_key_ids.iter().map(|c_id| c_id.get()).collect(), + } +} + +pub(crate) fn deserialize_sort_key_ids(sort_key_ids: proto::SortKeyIds) -> SortKeyIds { + SortKeyIds::new(sort_key_ids.column_ids.into_iter().map(ColumnId::new)) +} + +pub(crate) fn serialize_partition(partition: Partition) -> proto::Partition { + let empty_sk = SortKeyIds::new(std::iter::empty()); + + proto::Partition { + id: partition.id.get(), + hash_id: partition + .hash_id() + .map(|id| id.as_bytes().to_vec()) + .unwrap_or_default(), + partition_key: partition.partition_key.inner().to_owned(), + table_id: partition.table_id.get(), + sort_key_ids: Some(serialize_sort_key_ids( + partition.sort_key_ids().unwrap_or(&empty_sk), + )), + new_file_at: partition.new_file_at.map(|ts| ts.get()), + } +} + +pub(crate) fn deserialize_partition(partition: proto::Partition) -> Result { + Ok(Partition::new_catalog_only( + PartitionId::new(partition.id), + (!partition.hash_id.is_empty()) + .then_some(partition.hash_id.as_slice()) + .convert_opt() + .ctx("hash_id")?, + TableId::new(partition.table_id), + partition.partition_key.into(), + deserialize_sort_key_ids(partition.sort_key_ids.required().ctx("sort_key_ids")?), + partition.new_file_at.map(Timestamp::new), + )) +} + +pub(crate) fn serialize_skipped_compaction(sc: SkippedCompaction) -> proto::SkippedCompaction { + proto::SkippedCompaction { + partition_id: sc.partition_id.get(), + reason: sc.reason, + skipped_at: sc.skipped_at.get(), + estimated_bytes: sc.estimated_bytes, + limit_bytes: sc.limit_bytes, + num_files: sc.num_files, + limit_num_files: sc.limit_num_files, + limit_num_files_first_in_partition: sc.limit_num_files_first_in_partition, + } +} + +pub(crate) fn deserialize_skipped_compaction(sc: proto::SkippedCompaction) -> SkippedCompaction { + SkippedCompaction { + 
partition_id: PartitionId::new(sc.partition_id), + reason: sc.reason, + skipped_at: Timestamp::new(sc.skipped_at), + estimated_bytes: sc.estimated_bytes, + limit_bytes: sc.limit_bytes, + num_files: sc.num_files, + limit_num_files: sc.limit_num_files, + limit_num_files_first_in_partition: sc.limit_num_files_first_in_partition, + } +} + +pub(crate) fn serialize_object_store_id(id: ObjectStoreId) -> proto::ObjectStoreId { + let (high64, low64) = id.get_uuid().as_u64_pair(); + proto::ObjectStoreId { high64, low64 } +} + +pub(crate) fn deserialize_object_store_id(id: proto::ObjectStoreId) -> ObjectStoreId { + ObjectStoreId::from_uuid(Uuid::from_u64_pair(id.high64, id.low64)) +} + +pub(crate) fn serialize_column_set(set: &ColumnSet) -> proto::ColumnSet { + proto::ColumnSet { + column_ids: set.iter().map(|id| id.get()).collect(), + } +} + +pub(crate) fn deserialize_column_set(set: proto::ColumnSet) -> ColumnSet { + ColumnSet::new(set.column_ids.into_iter().map(ColumnId::new)) +} + +pub(crate) fn serialize_parquet_file_params( + params: &ParquetFileParams, +) -> proto::ParquetFileParams { + proto::ParquetFileParams { + namespace_id: params.namespace_id.get(), + table_id: params.table_id.get(), + partition_id: params.partition_id.get(), + partition_hash_id: params + .partition_hash_id + .as_ref() + .map(|id| id.as_bytes().to_vec()), + object_store_id: Some(serialize_object_store_id(params.object_store_id)), + min_time: params.min_time.get(), + max_time: params.max_time.get(), + file_size_bytes: params.file_size_bytes, + row_count: params.row_count, + compaction_level: params.compaction_level as i32, + created_at: params.created_at.get(), + column_set: Some(serialize_column_set(¶ms.column_set)), + max_l0_created_at: params.max_l0_created_at.get(), + } +} + +pub(crate) fn deserialize_parquet_file_params( + params: proto::ParquetFileParams, +) -> Result { + Ok(ParquetFileParams { + namespace_id: NamespaceId::new(params.namespace_id), + table_id: TableId::new(params.table_id), 
+ partition_id: PartitionId::new(params.partition_id), + partition_hash_id: params + .partition_hash_id + .as_deref() + .convert_opt() + .ctx("partition_hash_id")?, + object_store_id: deserialize_object_store_id( + params.object_store_id.required().ctx("object_store_id")?, + ), + min_time: Timestamp::new(params.min_time), + max_time: Timestamp::new(params.max_time), + file_size_bytes: params.file_size_bytes, + row_count: params.row_count, + compaction_level: params.compaction_level.convert().ctx("compaction_level")?, + created_at: Timestamp::new(params.created_at), + column_set: deserialize_column_set(params.column_set.required().ctx("column_set")?), + max_l0_created_at: Timestamp::new(params.max_l0_created_at), + }) +} + +pub(crate) fn serialize_parquet_file(file: ParquetFile) -> proto::ParquetFile { + let partition_hash_id = file + .partition_hash_id + .map(|x| x.as_bytes().to_vec()) + .unwrap_or_default(); + + proto::ParquetFile { + id: file.id.get(), + namespace_id: file.namespace_id.get(), + table_id: file.table_id.get(), + partition_id: file.partition_id.get(), + partition_hash_id, + object_store_id: Some(serialize_object_store_id(file.object_store_id)), + min_time: file.min_time.get(), + max_time: file.max_time.get(), + to_delete: file.to_delete.map(|ts| ts.get()), + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level as i32, + created_at: file.created_at.get(), + column_set: Some(serialize_column_set(&file.column_set)), + max_l0_created_at: file.max_l0_created_at.get(), + } +} + +pub(crate) fn deserialize_parquet_file(file: proto::ParquetFile) -> Result { + let partition_hash_id = match file.partition_hash_id.as_slice() { + b"" => None, + s => Some(s.convert().ctx("partition_hash_id")?), + }; + + Ok(ParquetFile { + id: ParquetFileId::new(file.id), + namespace_id: NamespaceId::new(file.namespace_id), + table_id: TableId::new(file.table_id), + partition_id: PartitionId::new(file.partition_id), + 
partition_hash_id, + object_store_id: deserialize_object_store_id( + file.object_store_id.required().ctx("object_store_id")?, + ), + min_time: Timestamp::new(file.min_time), + max_time: Timestamp::new(file.max_time), + to_delete: file.to_delete.map(Timestamp::new), + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level.convert().ctx("compaction_level")?, + created_at: Timestamp::new(file.created_at), + column_set: deserialize_column_set(file.column_set.required().ctx("column_set")?), + max_l0_created_at: Timestamp::new(file.max_l0_created_at), + }) +} + +#[cfg(test)] +mod tests { + use data_types::{ + partition_template::TablePartitionTemplateOverride, CompactionLevel, PartitionHashId, + PartitionKey, + }; + + use super::*; + + #[test] + fn test_column_type_roundtrip() { + assert_column_type_roundtrip(ColumnType::Bool); + assert_column_type_roundtrip(ColumnType::I64); + assert_column_type_roundtrip(ColumnType::U64); + assert_column_type_roundtrip(ColumnType::F64); + assert_column_type_roundtrip(ColumnType::String); + assert_column_type_roundtrip(ColumnType::Tag); + assert_column_type_roundtrip(ColumnType::Time); + } + + #[track_caller] + fn assert_column_type_roundtrip(t: ColumnType) { + let protobuf = serialize_column_type(t); + let t2 = deserialize_column_type(protobuf).unwrap(); + assert_eq!(t, t2); + } + + #[test] + fn test_error_roundtrip() { + use crate::interface::Error; + + assert_error_roundtrip(Error::AlreadyExists { + descr: "foo".to_owned(), + }); + assert_error_roundtrip(Error::External { + source: "foo".to_owned().into(), + }); + assert_error_roundtrip(Error::LimitExceeded { + descr: "foo".to_owned(), + }); + assert_error_roundtrip(Error::NotFound { + descr: "foo".to_owned(), + }); + } + + #[track_caller] + fn assert_error_roundtrip(e: crate::interface::Error) { + let msg_orig = e.to_string(); + + let status = catalog_error_to_status(e); + let e = convert_status(status); + let msg = 
e.to_string(); + assert_eq!(msg, msg_orig); + } + + #[test] + fn test_soft_deleted_rows_roundtrip() { + assert_soft_deleted_rows_roundtrip(SoftDeletedRows::AllRows); + assert_soft_deleted_rows_roundtrip(SoftDeletedRows::ExcludeDeleted); + assert_soft_deleted_rows_roundtrip(SoftDeletedRows::OnlyDeleted); + } + + #[track_caller] + fn assert_soft_deleted_rows_roundtrip(sdr: SoftDeletedRows) { + let protobuf = serialize_soft_deleted_rows(sdr); + let sdr2 = deserialize_soft_deleted_rows(protobuf).unwrap(); + assert_eq!(sdr, sdr2); + } + + #[test] + fn test_namespace_roundtrip() { + use generated_types::influxdata::iox::partition_template::v1 as proto; + + let ns = Namespace { + id: NamespaceId::new(1), + name: "ns".to_owned(), + retention_period_ns: Some(2), + max_tables: 3.try_into().unwrap(), + max_columns_per_table: 4.try_into().unwrap(), + deleted_at: Some(Timestamp::new(5)), + partition_template: NamespacePartitionTemplateOverride::try_from( + proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }, + ) + .unwrap(), + }; + let protobuf = serialize_namespace(ns.clone()); + let ns2 = deserialize_namespace(protobuf).unwrap(); + assert_eq!(ns, ns2); + } + + #[test] + fn test_table_roundtrip() { + use generated_types::influxdata::iox::partition_template::v1 as proto; + + let table = Table { + id: TableId::new(1), + namespace_id: NamespaceId::new(2), + name: "table".to_owned(), + partition_template: TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }), + &NamespacePartitionTemplateOverride::const_default(), + ) + .unwrap(), + }; + let protobuf = serialize_table(table.clone()); + let table2 = deserialize_table(protobuf).unwrap(); + assert_eq!(table, table2); + } + + #[test] + fn test_column_roundtrip() { + let column = Column { + id: 
ColumnId::new(1), + table_id: TableId::new(2), + name: "col".to_owned(), + column_type: ColumnType::F64, + }; + let protobuf = serialize_column(column.clone()); + let column2 = deserialize_column(protobuf).unwrap(); + assert_eq!(column, column2); + } + + #[test] + fn test_sort_key_ids_roundtrip() { + assert_sort_key_ids_roundtrip(SortKeyIds::new(std::iter::empty())); + assert_sort_key_ids_roundtrip(SortKeyIds::new([ColumnId::new(1)])); + assert_sort_key_ids_roundtrip(SortKeyIds::new([ + ColumnId::new(1), + ColumnId::new(5), + ColumnId::new(20), + ])); + } + + #[track_caller] + fn assert_sort_key_ids_roundtrip(sort_key_ids: SortKeyIds) { + let protobuf = serialize_sort_key_ids(&sort_key_ids); + let sort_key_ids2 = deserialize_sort_key_ids(protobuf); + assert_eq!(sort_key_ids, sort_key_ids2); + } + + #[test] + fn test_partition_roundtrip() { + let table_id = TableId::new(1); + let partition_key = PartitionKey::from("key"); + let hash_id = PartitionHashId::new(table_id, &partition_key); + + assert_partition_roundtrip(Partition::new_catalog_only( + PartitionId::new(2), + Some(hash_id.clone()), + table_id, + partition_key.clone(), + SortKeyIds::new([ColumnId::new(3), ColumnId::new(4)]), + Some(Timestamp::new(5)), + )); + assert_partition_roundtrip(Partition::new_catalog_only( + PartitionId::new(2), + Some(hash_id), + table_id, + partition_key, + SortKeyIds::new(std::iter::empty()), + Some(Timestamp::new(5)), + )); + } + + #[track_caller] + fn assert_partition_roundtrip(partition: Partition) { + let protobuf = serialize_partition(partition.clone()); + let partition2 = deserialize_partition(protobuf).unwrap(); + assert_eq!(partition, partition2); + } + + #[test] + fn test_skipped_compaction_roundtrip() { + let sc = SkippedCompaction { + partition_id: PartitionId::new(1), + reason: "foo".to_owned(), + skipped_at: Timestamp::new(2), + estimated_bytes: 3, + limit_bytes: 4, + num_files: 5, + limit_num_files: 6, + limit_num_files_first_in_partition: 7, + }; + let protobuf = 
serialize_skipped_compaction(sc.clone()); + let sc2 = deserialize_skipped_compaction(protobuf); + assert_eq!(sc, sc2); + } + + #[test] + fn test_object_store_id_roundtrip() { + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::nil())); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(0))); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(u128::MAX))); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(1))); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(u128::MAX - 1))); + } + + #[track_caller] + fn assert_object_store_id_roundtrip(id: ObjectStoreId) { + let protobuf = serialize_object_store_id(id); + let id2 = deserialize_object_store_id(protobuf); + assert_eq!(id, id2); + } + + #[test] + fn test_column_set_roundtrip() { + assert_column_set_roundtrip(ColumnSet::new([])); + assert_column_set_roundtrip(ColumnSet::new([ColumnId::new(1)])); + assert_column_set_roundtrip(ColumnSet::new([ColumnId::new(1), ColumnId::new(10)])); + assert_column_set_roundtrip(ColumnSet::new([ + ColumnId::new(3), + ColumnId::new(4), + ColumnId::new(10), + ])); + } + + #[track_caller] + fn assert_column_set_roundtrip(set: ColumnSet) { + let protobuf = serialize_column_set(&set); + let set2 = deserialize_column_set(protobuf); + assert_eq!(set, set2); + } + + #[test] + fn test_parquet_file_params_roundtrip() { + let params = ParquetFileParams { + namespace_id: NamespaceId::new(1), + table_id: TableId::new(2), + partition_id: PartitionId::new(3), + partition_hash_id: Some(PartitionHashId::arbitrary_for_testing()), + object_store_id: ObjectStoreId::from_uuid(Uuid::from_u128(1337)), + min_time: Timestamp::new(4), + max_time: Timestamp::new(5), + file_size_bytes: 6, + row_count: 7, + compaction_level: CompactionLevel::Final, + created_at: Timestamp::new(8), + column_set: ColumnSet::new([ColumnId::new(9), ColumnId::new(10)]), + max_l0_created_at: Timestamp::new(11), + }; + let 
protobuf = serialize_parquet_file_params(¶ms); + let params2 = deserialize_parquet_file_params(protobuf).unwrap(); + assert_eq!(params, params2); + } + + #[test] + fn test_parquet_file_roundtrip() { + let file = ParquetFile { + id: ParquetFileId::new(12), + namespace_id: NamespaceId::new(1), + table_id: TableId::new(2), + partition_id: PartitionId::new(3), + partition_hash_id: Some(PartitionHashId::arbitrary_for_testing()), + object_store_id: ObjectStoreId::from_uuid(Uuid::from_u128(1337)), + min_time: Timestamp::new(4), + max_time: Timestamp::new(5), + to_delete: Some(Timestamp::new(13)), + file_size_bytes: 6, + row_count: 7, + compaction_level: CompactionLevel::Final, + created_at: Timestamp::new(8), + column_set: ColumnSet::new([ColumnId::new(9), ColumnId::new(10)]), + max_l0_created_at: Timestamp::new(11), + }; + let protobuf = serialize_parquet_file(file.clone()); + let file2 = deserialize_parquet_file(protobuf).unwrap(); + assert_eq!(file, file2); + } +} diff --git a/iox_catalog/src/grpc/server.rs b/iox_catalog/src/grpc/server.rs new file mode 100644 index 0000000..2105457 --- /dev/null +++ b/iox_catalog/src/grpc/server.rs @@ -0,0 +1,1032 @@ +//! gRPC server implementation. 
+ +use std::{pin::Pin, sync::Arc}; + +use crate::{ + grpc::serialization::{ + catalog_error_to_status, deserialize_column_type, deserialize_object_store_id, + deserialize_parquet_file_params, deserialize_soft_deleted_rows, deserialize_sort_key_ids, + serialize_column, serialize_namespace, serialize_object_store_id, serialize_parquet_file, + serialize_partition, serialize_skipped_compaction, serialize_sort_key_ids, serialize_table, + ContextExt, ConvertExt, ConvertOptExt, RequiredExt, + }, + interface::{CasFailure, Catalog}, +}; +use async_trait::async_trait; +use data_types::{ + NamespaceId, NamespaceServiceProtectionLimitsOverride, PartitionId, PartitionKey, TableId, + Timestamp, +}; +use futures::{Stream, StreamExt, TryStreamExt}; +use generated_types::influxdata::iox::catalog::v2 as proto; +use generated_types::influxdata::iox::catalog::v2::{TableSnapshotRequest, TableSnapshotResponse}; +use tonic::{Request, Response, Status}; + +type TonicStream = Pin> + Send + 'static>>; + +/// gRPC server. +#[derive(Debug)] +pub struct GrpcCatalogServer { + catalog: Arc, +} + +impl GrpcCatalogServer { + /// Create a new [`GrpcCatalogServer`]. + pub fn new(catalog: Arc) -> Self { + Self { catalog } + } + + /// Get service for integration w/ tonic. 
+ pub fn service(&self) -> proto::catalog_service_server::CatalogServiceServer { + let this = Self { + catalog: Arc::clone(&self.catalog), + }; + proto::catalog_service_server::CatalogServiceServer::new(this) + } +} + +#[async_trait] +impl proto::catalog_service_server::CatalogService for GrpcCatalogServer { + type NamespaceListStream = TonicStream; + + type TableListByNamespaceIdStream = TonicStream; + type TableListStream = TonicStream; + + type ColumnCreateOrGetManyUncheckedStream = + TonicStream; + type ColumnListByNamespaceIdStream = TonicStream; + type ColumnListByTableIdStream = TonicStream; + type ColumnListStream = TonicStream; + + type PartitionGetByIdBatchStream = TonicStream; + type PartitionListByTableIdStream = TonicStream; + type PartitionListIdsStream = TonicStream; + type PartitionGetInSkippedCompactionsStream = + TonicStream; + type PartitionListSkippedCompactionsStream = + TonicStream; + type PartitionMostRecentNStream = TonicStream; + type PartitionNewFileBetweenStream = TonicStream; + type PartitionListOldStyleStream = TonicStream; + + type ParquetFileFlagForDeleteByRetentionStream = + TonicStream; + type ParquetFileDeleteOldIdsOnlyStream = + TonicStream; + type ParquetFileListByPartitionNotToDeleteBatchStream = + TonicStream; + type ParquetFileExistsByObjectStoreIdBatchStream = + TonicStream; + + async fn namespace_create( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .create( + &req.name.convert().ctx("name")?, + req.partition_template + .convert_opt() + .ctx("partition_template")?, + req.retention_period_ns, + req.service_protection_limits + .map(|l| { + let l = NamespaceServiceProtectionLimitsOverride { + max_tables: l.max_tables.convert_opt().ctx("max_tables")?, + max_columns_per_table: l + .max_columns_per_table + .convert_opt() + .ctx("max_columns_per_table")?, + }; + Ok(l) as Result<_, tonic::Status> + }) + .transpose()?, + 
) + .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new(proto::NamespaceCreateResponse { + namespace: Some(ns), + })) + } + + async fn namespace_update_retention_period( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .update_retention_period(&req.name, req.retention_period_ns) + .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new( + proto::NamespaceUpdateRetentionPeriodResponse { + namespace: Some(ns), + }, + )) + } + + async fn namespace_list( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let deleted = deserialize_soft_deleted_rows(req.deleted)?; + + let ns_list = self + .catalog + .repositories() + .namespaces() + .list(deleted) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(ns_list.into_iter().map(|ns| { + let ns = serialize_namespace(ns); + + Ok(proto::NamespaceListResponse { + namespace: Some(ns), + }) + })) + .boxed(), + )) + } + + async fn namespace_get_by_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let deleted = deserialize_soft_deleted_rows(req.deleted)?; + + let maybe_ns = self + .catalog + .repositories() + .namespaces() + .get_by_id(NamespaceId::new(req.id), deleted) + .await + .map_err(catalog_error_to_status)?; + + let maybe_ns = maybe_ns.map(serialize_namespace); + + Ok(Response::new(proto::NamespaceGetByIdResponse { + namespace: maybe_ns, + })) + } + + async fn namespace_get_by_name( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let deleted = deserialize_soft_deleted_rows(req.deleted)?; + + let maybe_ns = self + .catalog + .repositories() + .namespaces() + .get_by_name(&req.name, deleted) + .await + .map_err(catalog_error_to_status)?; 
+ + let maybe_ns = maybe_ns.map(serialize_namespace); + + Ok(Response::new(proto::NamespaceGetByNameResponse { + namespace: maybe_ns, + })) + } + + async fn namespace_soft_delete( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + self.catalog + .repositories() + .namespaces() + .soft_delete(&req.name) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::NamespaceSoftDeleteResponse {})) + } + + async fn namespace_update_table_limit( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .update_table_limit(&req.name, req.new_max.convert().ctx("new_max")?) + .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new(proto::NamespaceUpdateTableLimitResponse { + namespace: Some(ns), + })) + } + + async fn namespace_update_column_limit( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .update_column_limit(&req.name, req.new_max.convert().ctx("new_max")?) 
+ .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new(proto::NamespaceUpdateColumnLimitResponse { + namespace: Some(ns), + })) + } + + async fn table_create( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let table = self + .catalog + .repositories() + .tables() + .create( + &req.name, + req.partition_template.convert().ctx("partition_template")?, + NamespaceId::new(req.namespace_id), + ) + .await + .map_err(catalog_error_to_status)?; + + let table = serialize_table(table); + + Ok(Response::new(proto::TableCreateResponse { + table: Some(table), + })) + } + + async fn table_get_by_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_table = self + .catalog + .repositories() + .tables() + .get_by_id(TableId::new(req.id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::TableGetByIdResponse { + table: maybe_table.map(serialize_table), + })) + } + + async fn table_get_by_namespace_and_name( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_table = self + .catalog + .repositories() + .tables() + .get_by_namespace_and_name(NamespaceId::new(req.namespace_id), &req.name) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::TableGetByNamespaceAndNameResponse { + table: maybe_table.map(serialize_table), + })) + } + + async fn table_list_by_namespace_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let table_list = self + .catalog + .repositories() + .tables() + .list_by_namespace_id(NamespaceId::new(req.namespace_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(table_list.into_iter().map(|table| { + let table = serialize_table(table); + Ok(proto::TableListByNamespaceIdResponse { table: Some(table) }) + })) + 
.boxed(), + )) + } + + async fn table_list( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let table_list = self + .catalog + .repositories() + .tables() + .list() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(table_list.into_iter().map(|table| { + let table = serialize_table(table); + Ok(proto::TableListResponse { table: Some(table) }) + })) + .boxed(), + )) + } + + async fn table_snapshot( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let snapshot = self + .catalog + .repositories() + .tables() + .snapshot(TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(TableSnapshotResponse { + generation: snapshot.generation(), + table: Some(snapshot.into()), + })) + } + + async fn column_create_or_get( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let column_type = deserialize_column_type(req.column_type)?; + + let column = self + .catalog + .repositories() + .columns() + .create_or_get(&req.name, TableId::new(req.table_id), column_type) + .await + .map_err(catalog_error_to_status)?; + + let column = serialize_column(column); + + Ok(Response::new(proto::ColumnCreateOrGetResponse { + column: Some(column), + })) + } + + async fn column_create_or_get_many_unchecked( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let columns = req + .columns + .iter() + .map(|(name, t)| { + let t = deserialize_column_type(*t)?; + Ok((name.as_str(), t)) + }) + .collect::>()?; + + let column_list = self + .catalog + .repositories() + .columns() + .create_or_get_many_unchecked(TableId::new(req.table_id), columns) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnCreateOrGetManyUncheckedResponse { + column: 
Some(column), + }) + })) + .boxed(), + )) + } + + async fn column_list_by_namespace_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let column_list = self + .catalog + .repositories() + .columns() + .list_by_namespace_id(NamespaceId::new(req.namespace_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnListByNamespaceIdResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn column_list_by_table_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let column_list = self + .catalog + .repositories() + .columns() + .list_by_table_id(TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnListByTableIdResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn column_list( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let column_list = self + .catalog + .repositories() + .columns() + .list() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnListResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn partition_create_or_get( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let partition = self + .catalog + .repositories() + .partitions() + .create_or_get(PartitionKey::from(req.key), TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + let partition = serialize_partition(partition); + + Ok(Response::new(proto::PartitionCreateOrGetResponse { + partition: Some(partition), 
+ })) + } + + async fn partition_get_by_id_batch( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let partition_ids = req + .partition_ids + .into_iter() + .map(PartitionId::new) + .collect::>(); + + let partition_list = self + .catalog + .repositories() + .partitions() + .get_by_id_batch(&partition_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionGetByIdBatchResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_list_by_table_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let partition_list = self + .catalog + .repositories() + .partitions() + .list_by_table_id(TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionListByTableIdResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_list_ids( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let id_list = self + .catalog + .repositories() + .partitions() + .list_ids() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + Ok(proto::PartitionListIdsResponse { + partition_id: id.get(), + }) + })) + .boxed(), + )) + } + + async fn partition_cas_sort_key( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let res = self + .catalog + .repositories() + .partitions() + .cas_sort_key( + PartitionId::new(req.partition_id), + req.old_sort_key_ids.map(deserialize_sort_key_ids).as_ref(), + &deserialize_sort_key_ids(req.new_sort_key_ids.required().ctx("new_sort_key_ids")?), 
+ ) + .await; + + match res { + Ok(partition) => Ok(Response::new(proto::PartitionCasSortKeyResponse { + res: Some(proto::partition_cas_sort_key_response::Res::Partition( + serialize_partition(partition), + )), + })), + Err(CasFailure::ValueMismatch(sort_key_ids)) => { + Ok(Response::new(proto::PartitionCasSortKeyResponse { + res: Some(proto::partition_cas_sort_key_response::Res::CurrentSortKey( + serialize_sort_key_ids(&sort_key_ids), + )), + })) + } + Err(CasFailure::QueryError(e)) => Err(catalog_error_to_status(e)), + } + } + + async fn partition_record_skipped_compaction( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + self.catalog + .repositories() + .partitions() + .record_skipped_compaction( + PartitionId::new(req.partition_id), + &req.reason, + req.num_files as usize, + req.limit_num_files as usize, + req.limit_num_files_first_in_partition as usize, + req.estimated_bytes, + req.limit_bytes, + ) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + proto::PartitionRecordSkippedCompactionResponse {}, + )) + } + + async fn partition_get_in_skipped_compactions( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let partition_ids = req + .partition_ids + .into_iter() + .map(PartitionId::new) + .collect::>(); + + let skipped_compaction_list = self + .catalog + .repositories() + .partitions() + .get_in_skipped_compactions(&partition_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(skipped_compaction_list.into_iter().map(|sc| { + let sc = serialize_skipped_compaction(sc); + Ok(proto::PartitionGetInSkippedCompactionsResponse { + skipped_compaction: Some(sc), + }) + })) + .boxed(), + )) + } + + async fn partition_list_skipped_compactions( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let skipped_compaction_list = self + .catalog + .repositories() + .partitions() + 
.list_skipped_compactions() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(skipped_compaction_list.into_iter().map(|sc| { + let sc = serialize_skipped_compaction(sc); + Ok(proto::PartitionListSkippedCompactionsResponse { + skipped_compaction: Some(sc), + }) + })) + .boxed(), + )) + } + + async fn partition_delete_skipped_compactions( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_skipped_compaction = self + .catalog + .repositories() + .partitions() + .delete_skipped_compactions(PartitionId::new(req.partition_id)) + .await + .map_err(catalog_error_to_status)?; + + let maybe_skipped_compaction = maybe_skipped_compaction.map(serialize_skipped_compaction); + + Ok(Response::new( + proto::PartitionDeleteSkippedCompactionsResponse { + skipped_compaction: maybe_skipped_compaction, + }, + )) + } + + async fn partition_most_recent_n( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let partition_list = self + .catalog + .repositories() + .partitions() + .most_recent_n(req.n as usize) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionMostRecentNResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_new_file_between( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let id_list = self + .catalog + .repositories() + .partitions() + .partitions_new_file_between( + Timestamp::new(req.minimum_time), + req.maximum_time.map(Timestamp::new), + ) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + Ok(proto::PartitionNewFileBetweenResponse { + partition_id: id.get(), + }) + })) + .boxed(), + )) + } + + async fn 
partition_list_old_style( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let partition_list = self + .catalog + .repositories() + .partitions() + .list_old_style() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionListOldStyleResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_snapshot( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let snapshot = self + .catalog + .repositories() + .partitions() + .snapshot(PartitionId::new(req.partition_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::PartitionSnapshotResponse { + generation: snapshot.generation(), + partition: Some(snapshot.into()), + })) + } + + async fn parquet_file_flag_for_delete_by_retention( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let id_list = self + .catalog + .repositories() + .parquet_files() + .flag_for_delete_by_retention() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|(p_id, os_id)| { + let object_store_id = serialize_object_store_id(os_id); + Ok(proto::ParquetFileFlagForDeleteByRetentionResponse { + partition_id: p_id.get(), + object_store_id: Some(object_store_id), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_delete_old_ids_only( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let id_list = self + .catalog + .repositories() + .parquet_files() + .delete_old_ids_only(Timestamp::new(req.older_than)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + let object_store_id = serialize_object_store_id(id); + Ok(proto::ParquetFileDeleteOldIdsOnlyResponse { + object_store_id: 
Some(object_store_id), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_list_by_partition_not_to_delete_batch( + &self, + request: Request, + ) -> Result, tonic::Status> + { + let req = request.into_inner(); + let partition_ids = req + .partition_ids + .into_iter() + .map(PartitionId::new) + .collect::>(); + + let file_list = self + .catalog + .repositories() + .parquet_files() + .list_by_partition_not_to_delete_batch(partition_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(file_list.into_iter().map(|file| { + let file = serialize_parquet_file(file); + Ok(proto::ParquetFileListByPartitionNotToDeleteBatchResponse { + parquet_file: Some(file), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_get_by_object_store_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_file = self + .catalog + .repositories() + .parquet_files() + .get_by_object_store_id(deserialize_object_store_id( + req.object_store_id.required().ctx("object_store_id")?, + )) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + proto::ParquetFileGetByObjectStoreIdResponse { + parquet_file: maybe_file.map(serialize_parquet_file), + }, + )) + } + + async fn parquet_file_exists_by_object_store_id_batch( + &self, + request: Request>, + ) -> Result, tonic::Status> { + let object_store_ids = request + .into_inner() + .map_err(|e| tonic::Status::invalid_argument(e.to_string())) + .and_then(|req| async move { + Ok(deserialize_object_store_id( + req.object_store_id.required().ctx("object_store_id")?, + )) + }) + .try_collect::>() + .await?; + + let id_list = self + .catalog + .repositories() + .parquet_files() + .exists_by_object_store_id_batch(object_store_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + let object_store_id = serialize_object_store_id(id); + 
Ok(proto::ParquetFileExistsByObjectStoreIdBatchResponse { + object_store_id: Some(object_store_id), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_create_upgrade_delete( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let delete = req + .delete + .into_iter() + .map(deserialize_object_store_id) + .collect::>(); + let upgrade = req + .upgrade + .into_iter() + .map(deserialize_object_store_id) + .collect::>(); + let create = req + .create + .into_iter() + .map(deserialize_parquet_file_params) + .collect::, _>>()?; + + let id_list = self + .catalog + .repositories() + .parquet_files() + .create_upgrade_delete( + PartitionId::new(req.partition_id), + &delete, + &upgrade, + &create, + req.target_level.convert().ctx("target_level")?, + ) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + proto::ParquetFileCreateUpgradeDeleteResponse { + created_parquet_file_ids: id_list.into_iter().map(|id| id.get()).collect(), + }, + )) + } +} diff --git a/iox_catalog/src/interface.rs b/iox_catalog/src/interface.rs new file mode 100644 index 0000000..dae33a2 --- /dev/null +++ b/iox_catalog/src/interface.rs @@ -0,0 +1,490 @@ +//! Traits and data types for the IOx Catalog API. 
+ +use async_trait::async_trait; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionKey, SkippedCompaction, + SortKeyIds, Table, TableId, Timestamp, +}; +use iox_time::TimeProvider; +use snafu::Snafu; +use std::{ + collections::HashMap, + fmt::{Debug, Display}, + sync::Arc, +}; + +/// An error wrapper detailing the reason for a compare-and-swap failure. +#[derive(Debug)] +pub enum CasFailure { + /// The compare-and-swap failed because the current value differers from the + /// comparator. + /// + /// Contains the new current value. + ValueMismatch(T), + /// A query error occurred. 
+ QueryError(Error), +} + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("unhandled external error: {source}"))] + External { + source: Box, + }, + + #[snafu(display("already exists: {descr}"))] + AlreadyExists { descr: String }, + + #[snafu(display("limit exceeded: {descr}"))] + LimitExceeded { descr: String }, + + #[snafu(display("not found: {descr}"))] + NotFound { descr: String }, +} + +impl From for Error { + fn from(e: sqlx::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} + +impl From for Error { + fn from(e: sqlx::migrate::MigrateError) -> Self { + Self::from(sqlx::Error::from(e)) + } +} + +impl From for Error { + fn from(e: data_types::snapshot::partition::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} + +impl From for Error { + fn from(e: data_types::snapshot::table::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} + +impl From for Error { + fn from(e: catalog_cache::api::quorum::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} + +impl From for Error { + fn from(e: generated_types::prost::DecodeError) -> Self { + Self::External { + source: Box::new(e), + } + } +} + +/// A specialized `Error` for Catalog errors +pub type Result = std::result::Result; + +/// Specify how soft-deleted entities should affect query results. +/// +/// ```text +/// +/// ExcludeDeleted OnlyDeleted +/// +/// ┃ ┃ +/// .─────╋─────. .─────╋─────. +/// ,─' ┃ '─. ,─' ┃ '─. +/// ,' ● `,' ● `. +/// ,' ,' `. `. +/// ; ; : : +/// │ No deleted │ │ Only deleted │ +/// │ rows │ ● │ rows │ +/// : : ┃ ; ; +/// ╲ ╲ ┃ ╱ ╱ +/// `. `┃' ,' +/// `. ,'┃`. ,' +/// '─. ,─' ┃ '─. ,─' +/// `─────────' ┃ `─────────' +/// ┃ +/// +/// AllRows +/// +/// ``` +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SoftDeletedRows { + /// Return all rows. + AllRows, + + /// Return all rows, except soft deleted rows. 
+ ExcludeDeleted, + + /// Return only soft deleted rows. + OnlyDeleted, +} + +impl SoftDeletedRows { + pub(crate) fn as_sql_predicate(&self) -> &str { + match self { + Self::ExcludeDeleted => "deleted_at IS NULL", + Self::OnlyDeleted => "deleted_at IS NOT NULL", + Self::AllRows => "1=1", + } + } +} + +/// Methods for working with the catalog. +#[async_trait] +pub trait Catalog: Send + Sync + Debug + Display { + /// Setup catalog for usage and apply possible migrations. + async fn setup(&self) -> Result<(), Error>; + + /// Accesses the repositories without a transaction scope. + fn repositories(&self) -> Box; + + /// Gets metric registry associated with this catalog for testing purposes. + #[cfg(test)] + fn metrics(&self) -> Arc; + + /// Gets the time provider associated with this catalog. + fn time_provider(&self) -> Arc; +} + +/// Methods for working with the catalog's various repositories (collections of entities). +/// +/// # Repositories +/// +/// The methods (e.g. `create_*` or `get_by_*`) for handling entities (namespaces, partitions, +/// etc.) are grouped into *repositories* with one repository per entity. A repository can be +/// thought of a collection of a single kind of entity. Getting repositories from the transaction +/// is cheap. +/// +/// A repository might internally map to a wide range of different storage abstractions, ranging +/// from one or more SQL tables over key-value key spaces to simple in-memory vectors. The user +/// should and must not care how these are implemented. +pub trait RepoCollection: Send + Sync + Debug { + /// Repository for [namespaces](data_types::Namespace). + fn namespaces(&mut self) -> &mut dyn NamespaceRepo; + + /// Repository for [tables](data_types::Table). + fn tables(&mut self) -> &mut dyn TableRepo; + + /// Repository for [columns](data_types::Column). + fn columns(&mut self) -> &mut dyn ColumnRepo; + + /// Repository for [partitions](data_types::Partition). 
+ fn partitions(&mut self) -> &mut dyn PartitionRepo; + + /// Repository for [Parquet files](data_types::ParquetFile). + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo; +} + +/// Functions for working with namespaces in the catalog +#[async_trait] +pub trait NamespaceRepo: Send + Sync { + /// Creates the namespace in the catalog. If one by the same name already exists, an + /// error is returned. + /// Specify `None` for `retention_period_ns` to get infinite retention. + async fn create( + &mut self, + name: &NamespaceName<'_>, + partition_template: Option, + retention_period_ns: Option, + service_protection_limits: Option, + ) -> Result; + + /// Update retention period for a namespace + async fn update_retention_period( + &mut self, + name: &str, + retention_period_ns: Option, + ) -> Result; + + /// List all namespaces. + async fn list(&mut self, deleted: SoftDeletedRows) -> Result>; + + /// Gets the namespace by its ID. + async fn get_by_id( + &mut self, + id: NamespaceId, + deleted: SoftDeletedRows, + ) -> Result>; + + /// Gets the namespace by its unique name. + async fn get_by_name( + &mut self, + name: &str, + deleted: SoftDeletedRows, + ) -> Result>; + + /// Soft-delete a namespace by name + async fn soft_delete(&mut self, name: &str) -> Result<()>; + + /// Update the limit on the number of tables that can exist per namespace. + async fn update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result; + + /// Update the limit on the number of columns that can exist per table in a given namespace. + async fn update_column_limit( + &mut self, + name: &str, + new_max: MaxColumnsPerTable, + ) -> Result; +} + +/// Functions for working with tables in the catalog +#[async_trait] +pub trait TableRepo: Send + Sync { + /// Creates the table in the catalog. If one in the same namespace with the same name already + /// exists, an error is returned. 
+ async fn create( + &mut self, + name: &str, + partition_template: TablePartitionTemplateOverride, + namespace_id: NamespaceId, + ) -> Result
; + + /// get table by ID + async fn get_by_id(&mut self, table_id: TableId) -> Result>; + + /// get table by namespace ID and name + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result>; + + /// Lists all tables in the catalog for the given namespace id. + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result>; + + /// List all tables. + async fn list(&mut self) -> Result>; + + /// Obtain a table snapshot + async fn snapshot(&mut self, table_id: TableId) -> Result; +} + +/// Functions for working with columns in the catalog +#[async_trait] +pub trait ColumnRepo: Send + Sync { + /// Creates the column in the catalog or returns the existing column. Will return a + /// `Error::ColumnTypeMismatch` if the existing column type doesn't match the type + /// the caller is attempting to create. + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result; + + /// Perform a bulk upsert of columns specified by a map of column name to column type. + /// + /// Implementations make no guarantees as to the ordering or atomicity of + /// the batch of column upsert operations - a batch upsert may partially + /// commit, in which case an error MUST be returned by the implementation. + /// + /// Per-namespace limits on the number of columns allowed per table are explicitly NOT checked + /// by this function, hence the name containing `unchecked`. It is expected that the caller + /// will check this first-- and yes, this is racy. + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result>; + + /// Lists all columns in the passed in namespace id. + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result>; + + /// List all columns for the given table ID. + async fn list_by_table_id(&mut self, table_id: TableId) -> Result>; + + /// List all columns. 
+    // NOTE(review): the enclosing trait of this `list` method starts before this hunk;
+    // the restored `Column` item type is inferred from repo ordering — confirm upstream.
+    async fn list(&mut self) -> Result<Vec<Column>>;
+}
+
+/// Extension trait for [`PartitionRepo`]
+#[async_trait]
+pub trait PartitionRepoExt {
+    /// get the partition by ID, returning `None` if it does not exist
+    async fn get_by_id(self, partition_id: PartitionId) -> Result<Option<Partition>>;
+}
+
+#[async_trait]
+impl PartitionRepoExt for &mut dyn PartitionRepo {
+    async fn get_by_id(self, partition_id: PartitionId) -> Result<Option<Partition>> {
+        // Delegate to the batch lookup; a missing partition simply yields an empty batch.
+        let iter = self.get_by_id_batch(&[partition_id]).await?;
+        Ok(iter.into_iter().next())
+    }
+}
+
+/// Functions for working with IOx partitions in the catalog. These are how IOx splits up
+/// data within a namespace.
+#[async_trait]
+pub trait PartitionRepo: Send + Sync {
+    /// create or get a partition record for the given partition key and table
+    async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result<Partition>;
+
+    /// get multiple partitions by ID.
+    ///
+    /// the output order is undefined, non-existing partitions are not part of the output.
+    async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result<Vec<Partition>>;
+
+    /// return the partitions by table id
+    async fn list_by_table_id(&mut self, table_id: TableId) -> Result<Vec<Partition>>;
+
+    /// return all partitions IDs
+    async fn list_ids(&mut self) -> Result<Vec<PartitionId>>;
+
+    /// Update the sort key for the partition, setting it to `new_sort_key_ids` iff
+    /// the current value matches `old_sort_key_ids`.
+    ///
+    /// NOTE: it is expected that ONLY the ingesters update sort keys for
+    /// existing partitions.
+    ///
+    /// # Spurious failure
+    ///
+    /// Implementations are allowed to spuriously return
+    /// [`CasFailure::ValueMismatch`] for performance reasons in the presence of
+    /// concurrent writers.
+    async fn cas_sort_key(
+        &mut self,
+        partition_id: PartitionId,
+        old_sort_key_ids: Option<&SortKeyIds>,
+        new_sort_key_ids: &SortKeyIds,
+    ) -> Result<Partition, CasFailure<SortKeyIds>>;
+
+    /// Record an instance of a partition being selected for compaction but compaction was not
+    /// completed for the specified reason.
+    #[allow(clippy::too_many_arguments)]
+    async fn record_skipped_compaction(
+        &mut self,
+        partition_id: PartitionId,
+        reason: &str,
+        num_files: usize,
+        limit_num_files: usize,
+        limit_num_files_first_in_partition: usize,
+        estimated_bytes: u64,
+        limit_bytes: u64,
+    ) -> Result<()>;
+
+    /// Get the record of partitions being skipped.
+    async fn get_in_skipped_compactions(
+        &mut self,
+        partition_id: &[PartitionId],
+    ) -> Result<Vec<SkippedCompaction>>;
+
+    /// List the records of compacting a partition being skipped. This is mostly useful for testing.
+    async fn list_skipped_compactions(&mut self) -> Result<Vec<SkippedCompaction>>;
+
+    /// Delete the records of skipping a partition being compacted.
+    async fn delete_skipped_compactions(
+        &mut self,
+        partition_id: PartitionId,
+    ) -> Result<Option<SkippedCompaction>>;
+
+    /// Return the N most recently created partitions.
+    async fn most_recent_n(&mut self, n: usize) -> Result<Vec<Partition>>;
+
+    /// Select partitions with a `new_file_at` value greater than the minimum time value and, if specified, less than
+    /// the maximum time value. Both range ends are exclusive; a timestamp exactly equal to either end will _not_ be
+    /// included in the results.
+    async fn partitions_new_file_between(
+        &mut self,
+        minimum_time: Timestamp,
+        maximum_time: Option<Timestamp>,
+    ) -> Result<Vec<PartitionId>>;
+
+    /// Return all partitions that do not have deterministic hash IDs in the catalog. Used in
+    /// the ingester's `OldPartitionBloomFilter` to determine whether a catalog query is necessary.
+    /// Can be removed when all partitions have hash IDs and support for old-style partitions is no
+    /// longer needed.
+    async fn list_old_style(&mut self) -> Result<Vec<Partition>>;
+
+    /// Obtain a partition snapshot
+    async fn snapshot(&mut self, partition_id: PartitionId) -> Result<PartitionSnapshot>;
+}
+
+/// Extension trait for [`ParquetFileRepo`]
+#[async_trait]
+pub trait ParquetFileRepoExt {
+    /// create the parquet file
+    async fn create(self, parquet_file_params: ParquetFileParams) -> Result<ParquetFile>;
+}
+
+#[async_trait]
+impl ParquetFileRepoExt for &mut dyn ParquetFileRepo {
+    /// create the parquet file
+    async fn create(self, params: ParquetFileParams) -> Result<ParquetFile> {
+        // A plain create is the degenerate create/upgrade/delete transaction:
+        // no deletions, no upgrades, one new file at the initial compaction level.
+        let files = self
+            .create_upgrade_delete(
+                params.partition_id,
+                &[],
+                &[],
+                &[params.clone()],
+                CompactionLevel::Initial,
+            )
+            .await?;
+        let id = files.into_iter().next().unwrap();
+        Ok(ParquetFile::from_params(params, id))
+    }
+}
+
+/// Functions for working with parquet file pointers in the catalog
+#[async_trait]
+pub trait ParquetFileRepo: Send + Sync {
+    /// Flag all parquet files for deletion that are older than their namespace's retention period.
+    async fn flag_for_delete_by_retention(&mut self) -> Result<Vec<(PartitionId, ObjectStoreId)>>;
+
+    /// Delete parquet files that were marked to be deleted earlier than the specified time.
+    ///
+    /// Returns the deleted IDs only.
+    ///
+    /// This deletion is limited to a certain (backend-specific) number of files to avoid overlarge
+    /// changes. The caller MAY call this method again if the result was NOT empty.
+    async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result<Vec<ObjectStoreId>>;
+
+    /// List parquet files for given partitions that are NOT marked as
+    /// [`to_delete`](ParquetFile::to_delete).
+    ///
+    /// The output order is undefined, non-existing partitions are not part of the output.
+    async fn list_by_partition_not_to_delete_batch(
+        &mut self,
+        partition_ids: Vec<PartitionId>,
+    ) -> Result<Vec<ParquetFile>>;
+
+    /// Return the parquet file with the given object store id
+    // used heavily in tests for verification of catalog state.
+ async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result>; + + /// Test a batch of parquet files exist by object store ids + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result>; + + /// Commit deletions, upgrades and creations in a single transaction. + /// + /// Returns IDs of created files. + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result>; +} diff --git a/iox_catalog/src/interface_tests.rs b/iox_catalog/src/interface_tests.rs new file mode 100644 index 0000000..4635483 --- /dev/null +++ b/iox_catalog/src/interface_tests.rs @@ -0,0 +1,3203 @@ +//! Abstract tests of the catalog interface w/o relying on the actual implementation. +use crate::{ + interface::{ + CasFailure, Catalog, Error, ParquetFileRepoExt, PartitionRepoExt, RepoCollection, + SoftDeletedRows, + }, + test_helpers::{arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table}, + util::{list_schemas, validate_or_insert_schema}, +}; + +use ::test_helpers::assert_error; +use assert_matches::assert_matches; +use async_trait::async_trait; +use data_types::snapshot::table::TableSnapshot; +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + ColumnId, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceSchema, ObjectStoreId, ParquetFile, ParquetFileId, ParquetFileParams, + PartitionId, SortKeyIds, TableId, Timestamp, +}; +use data_types::{snapshot::partition::PartitionSnapshot, Column, PartitionHashId, PartitionKey}; +use futures::{Future, StreamExt}; +use generated_types::influxdata::iox::partition_template::v1 as proto; +use iox_time::TimeProvider; +use metric::{Attributes, DurationHistogram, Metric}; +use parking_lot::Mutex; +use 
std::{any::Any, fmt::Display}; +use std::{ + collections::{BTreeMap, BTreeSet, HashMap}, + ops::DerefMut, + sync::Arc, + time::Duration, +}; + +pub(crate) async fn test_catalog(clean_state: R) +where + R: Fn() -> F + Send + Sync, + F: Future> + Send, +{ + test_setup(clean_state().await).await; + test_namespace_soft_deletion(clean_state().await).await; + test_partitions_new_file_between(clean_state().await).await; + test_column(clean_state().await).await; + test_partition(clean_state().await).await; + test_parquet_file(clean_state().await).await; + test_parquet_file_delete_broken(clean_state().await).await; + test_update_to_compaction_level_1(clean_state().await).await; + test_list_by_partiton_not_to_delete(clean_state().await).await; + test_list_schemas(clean_state().await).await; + test_list_schemas_soft_deleted_rows(clean_state().await).await; + test_delete_namespace(clean_state().await).await; + + let catalog = clean_state().await; + test_namespace(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "namespace_create"); + + let catalog = clean_state().await; + test_table(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "table_create"); + + let catalog = clean_state().await; + test_column(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "column_create_or_get"); + + let catalog = clean_state().await; + test_partition(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "partition_create_or_get"); + + let catalog = clean_state().await; + test_parquet_file(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "parquet_create_upgrade_delete"); + + test_two_repos(clean_state().await).await; + test_partition_create_or_get_idempotent(clean_state().await).await; + test_column_create_or_get_many_unchecked(clean_state).await; +} + +async fn test_setup(catalog: Arc) { + catalog.setup().await.expect("first catalog setup"); + catalog.setup().await.expect("second catalog setup"); +} + 
+/// Exercise namespace CRUD: create, duplicate-create conflict, lookups by id/name,
+/// listing, service-protection-limit updates, retention updates, and custom
+/// partition templates.
+async fn test_namespace(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace_name = NamespaceName::new("test_namespace").unwrap();
+    let namespace = repos
+        .namespaces()
+        .create(&namespace_name, None, None, None)
+        .await
+        .unwrap();
+    assert!(namespace.id > NamespaceId::new(0));
+    assert_eq!(namespace.name, namespace_name.as_str());
+    assert_eq!(
+        namespace.partition_template,
+        NamespacePartitionTemplateOverride::default()
+    );
+    let lookup_namespace = repos
+        .namespaces()
+        .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap()
+        .unwrap();
+    assert_eq!(namespace, lookup_namespace);
+
+    // Assert default values for service protection limits.
+    assert_eq!(namespace.max_tables, MaxTables::default());
+    assert_eq!(
+        namespace.max_columns_per_table,
+        MaxColumnsPerTable::default()
+    );
+
+    let conflict = repos
+        .namespaces()
+        .create(&namespace_name, None, None, None)
+        .await;
+    assert!(matches!(conflict.unwrap_err(), Error::AlreadyExists { .. }));
+
+    let found = repos
+        .namespaces()
+        .get_by_id(namespace.id, SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap()
+        .expect("namespace should be there");
+    assert_eq!(namespace, found);
+
+    let not_found = repos
+        .namespaces()
+        .get_by_id(NamespaceId::new(i64::MAX), SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap();
+    assert!(not_found.is_none());
+
+    let found = repos
+        .namespaces()
+        .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap()
+        .expect("namespace should be there");
+    assert_eq!(namespace, found);
+
+    let not_found = repos
+        .namespaces()
+        .get_by_name("does_not_exist", SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap();
+    assert!(not_found.is_none());
+
+    let namespace2 = arbitrary_namespace(&mut *repos, "test_namespace2").await;
+    let mut namespaces = repos
+        .namespaces()
+        .list(SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap();
+    namespaces.sort_by_key(|ns| ns.name.clone());
+    assert_eq!(namespaces, vec![namespace, namespace2]);
+
+    let new_table_limit = MaxTables::try_from(15_000).unwrap();
+    let modified = repos
+        .namespaces()
+        .update_table_limit(namespace_name.as_str(), new_table_limit)
+        .await
+        .expect("namespace should be updateable");
+    assert_eq!(new_table_limit, modified.max_tables);
+
+    let new_column_limit = MaxColumnsPerTable::try_from(1_500).unwrap();
+    let modified = repos
+        .namespaces()
+        .update_column_limit(namespace_name.as_str(), new_column_limit)
+        .await
+        .expect("namespace should be updateable");
+    assert_eq!(new_column_limit, modified.max_columns_per_table);
+
+    const NEW_RETENTION_PERIOD_NS: i64 = 5 * 60 * 60 * 1000 * 1000 * 1000;
+    let modified = repos
+        .namespaces()
+        .update_retention_period(namespace_name.as_str(), Some(NEW_RETENTION_PERIOD_NS))
+        .await
+        .expect("namespace should be updateable");
+    assert_eq!(
+        NEW_RETENTION_PERIOD_NS,
+        modified.retention_period_ns.unwrap()
+    );
+
+    let modified = repos
+        .namespaces()
+        .update_retention_period(namespace_name.as_str(), None)
+        .await
+        .expect("namespace should be updateable");
+    assert!(modified.retention_period_ns.is_none());
+
+    // create namespace with retention period NULL (the default)
+    let namespace3 = arbitrary_namespace(&mut *repos, "test_namespace3").await;
+    assert!(namespace3.retention_period_ns.is_none());
+
+    // create namespace with retention period
+    let namespace4_name = NamespaceName::new("test_namespace4").unwrap();
+    let namespace4 = repos
+        .namespaces()
+        .create(&namespace4_name, None, Some(NEW_RETENTION_PERIOD_NS), None)
+        .await
+        .expect("namespace with 5-hour retention should be created");
+    assert_eq!(
+        NEW_RETENTION_PERIOD_NS,
+        namespace4.retention_period_ns.unwrap()
+    );
+    // reset retention period to NULL to avoid affecting later tests
+    repos
+        .namespaces()
+        .update_retention_period(&namespace4_name, None)
+        .await
+        .expect("namespace should be updateable");
+
+    // create a namespace with a PartitionTemplate other than the default
+    let tag_partition_template =
+        NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate {
+            parts: vec![proto::TemplatePart {
+                part: Some(proto::template_part::Part::TagValue("tag1".into())),
+            }],
+        })
+        .unwrap();
+    let namespace5_name = NamespaceName::new("test_namespace5").unwrap();
+    let namespace5 = repos
+        .namespaces()
+        .create(
+            &namespace5_name,
+            Some(tag_partition_template.clone()),
+            None,
+            None,
+        )
+        .await
+        .unwrap();
+    assert_eq!(namespace5.partition_template, tag_partition_template);
+    let lookup_namespace5 = repos
+        .namespaces()
+        .get_by_name(&namespace5_name, SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap()
+        .unwrap();
+    assert_eq!(namespace5, lookup_namespace5);
+
+    // remove namespace to avoid it from affecting later tests
+    repos
+        .namespaces()
+        .soft_delete("test_namespace")
+        .await
+        .expect("delete namespace should succeed");
+    repos
+        .namespaces()
+        .soft_delete("test_namespace2")
+        .await
+        .expect("delete namespace should succeed");
+    repos
+        .namespaces()
+        .soft_delete("test_namespace3")
+        .await
+        .expect("delete namespace should succeed");
+    repos
+        .namespaces()
+        .soft_delete("test_namespace4")
+        .await
+        .expect("delete namespace should succeed");
+}
+
+/// Construct a set of two namespaces:
+///
+/// * deleted-ns: marked as soft-deleted
+/// * active-ns: not marked as deleted
+///
+/// And assert the expected "soft delete" semantics / correctly filter out
+/// the expected rows for all three states of [`SoftDeletedRows`].
+async fn test_namespace_soft_deletion(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+
+    let deleted_ns = arbitrary_namespace(&mut *repos, "deleted-ns").await;
+    let active_ns = arbitrary_namespace(&mut *repos, "active-ns").await;
+
+    // Mark "deleted-ns" as soft-deleted.
+    repos.namespaces().soft_delete("deleted-ns").await.unwrap();
+
+    // Which should be idempotent (ignoring the timestamp change - when
+    // changing this to "soft delete" it was idempotent, so I am preserving
+    // that).
+    repos.namespaces().soft_delete("deleted-ns").await.unwrap();
+
+    // Listing should respect soft deletion.
+    let got = repos
+        .namespaces()
+        .list(SoftDeletedRows::AllRows)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| v.name);
+    assert_string_set_eq(got, ["deleted-ns", "active-ns"]);
+
+    let got = repos
+        .namespaces()
+        .list(SoftDeletedRows::OnlyDeleted)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| v.name);
+    assert_string_set_eq(got, ["deleted-ns"]);
+
+    let got = repos
+        .namespaces()
+        .list(SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| v.name);
+    assert_string_set_eq(got, ["active-ns"]);
+
+    // As should get by ID
+    let got = repos
+        .namespaces()
+        .get_by_id(deleted_ns.id, SoftDeletedRows::AllRows)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| v.name);
+    assert_string_set_eq(got, ["deleted-ns"]);
+    let got = repos
+        .namespaces()
+        .get_by_id(deleted_ns.id, SoftDeletedRows::OnlyDeleted)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| {
+            assert!(v.deleted_at.is_some());
+            v.name
+        });
+    assert_string_set_eq(got, ["deleted-ns"]);
+    let got = repos
+        .namespaces()
+        .get_by_id(deleted_ns.id, SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap();
+    assert!(got.is_none());
+    let got = repos
+        .namespaces()
+        .get_by_id(active_ns.id, SoftDeletedRows::AllRows)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| v.name);
+    assert_string_set_eq(got, ["active-ns"]);
+    let got = repos
+        .namespaces()
+        .get_by_id(active_ns.id, SoftDeletedRows::OnlyDeleted)
+        .await
+        .unwrap();
+    assert!(got.is_none());
+    let got = repos
+        .namespaces()
+        .get_by_id(active_ns.id, SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| v.name);
+    assert_string_set_eq(got, ["active-ns"]);
+
+    // And get by name
+    let got = repos
+        .namespaces()
+        .get_by_name(&deleted_ns.name, SoftDeletedRows::AllRows)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| v.name);
+    assert_string_set_eq(got, ["deleted-ns"]);
+    let got = repos
+        .namespaces()
+        .get_by_name(&deleted_ns.name, SoftDeletedRows::OnlyDeleted)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| {
+            assert!(v.deleted_at.is_some());
+            v.name
+        });
+    assert_string_set_eq(got, ["deleted-ns"]);
+    let got = repos
+        .namespaces()
+        .get_by_name(&deleted_ns.name, SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap();
+    assert!(got.is_none());
+    let got = repos
+        .namespaces()
+        .get_by_name(&active_ns.name, SoftDeletedRows::AllRows)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| v.name);
+    assert_string_set_eq(got, ["active-ns"]);
+    let got = repos
+        .namespaces()
+        .get_by_name(&active_ns.name, SoftDeletedRows::OnlyDeleted)
+        .await
+        .unwrap();
+    assert!(got.is_none());
+    let got = repos
+        .namespaces()
+        .get_by_name(&active_ns.name, SoftDeletedRows::ExcludeDeleted)
+        .await
+        .unwrap()
+        .into_iter()
+        .map(|v| v.name);
+    assert_string_set_eq(got, ["active-ns"]);
+}
+
+// Assert the set of strings "a" is equal to the set "b", tolerating
+// duplicates.
+#[track_caller]
+fn assert_string_set_eq<T, U>(a: impl IntoIterator<Item = T>, b: impl IntoIterator<Item = U>)
+where
+    T: Into<String>,
+    U: Into<String>,
+{
+    let mut a = a.into_iter().map(Into::into).collect::<Vec<String>>();
+    a.sort_unstable();
+    let mut b = b.into_iter().map(Into::into).collect::<Vec<String>>();
+    b.sort_unstable();
+    assert_eq!(a, b);
+}
+
+/// Exercise table CRUD: creation, duplicate conflict, per-namespace lookups,
+/// per-namespace table limits, and partition templates (table-level and
+/// inherited from the namespace).
+async fn test_table(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace = arbitrary_namespace(&mut *repos, "namespace_table_test").await;
+
+    // test we can create a table
+    let t = arbitrary_table(&mut *repos, "test_table", &namespace).await;
+    assert!(t.id > TableId::new(0));
+    assert_eq!(
+        t.partition_template,
+        TablePartitionTemplateOverride::default()
+    );
+
+    // The default template doesn't use any tag values, so no columns need to be created.
+    let table_columns = repos.columns().list_by_table_id(t.id).await.unwrap();
+    assert!(table_columns.is_empty());
+
+    // test we get an error if we try to create it again
+    let err = repos
+        .tables()
+        .create(
+            "test_table",
+            TablePartitionTemplateOverride::try_new(None, &namespace.partition_template).unwrap(),
+            namespace.id,
+        )
+        .await;
+    assert_error!(
+        err,
+        Error::AlreadyExists { ref descr }
+            if descr == &format!("table 'test_table' in namespace {}", namespace.id)
+    );
+
+    // get by id
+    assert_eq!(t, repos.tables().get_by_id(t.id).await.unwrap().unwrap());
+    assert!(repos
+        .tables()
+        .get_by_id(TableId::new(i64::MAX))
+        .await
+        .unwrap()
+        .is_none());
+
+    let tables = repos
+        .tables()
+        .list_by_namespace_id(namespace.id)
+        .await
+        .unwrap();
+    assert_eq!(vec![t.clone()], tables);
+
+    // test we can create a table of the same name in a different namespace
+    let namespace2 = arbitrary_namespace(&mut *repos, "two").await;
+    assert_ne!(namespace, namespace2);
+    let test_table = arbitrary_table(&mut *repos, "test_table", &namespace2).await;
+    assert_ne!(t.id, test_table.id);
+    assert_eq!(test_table.namespace_id, namespace2.id);
+
+    // test get by namespace and name
+    let foo_table = arbitrary_table(&mut *repos, "foo", &namespace2).await;
+    assert_eq!(
+        repos
+            .tables()
+            .get_by_namespace_and_name(NamespaceId::new(i64::MAX), "test_table")
+            .await
+            .unwrap(),
+        None
+    );
+    assert_eq!(
+        repos
+            .tables()
+            .get_by_namespace_and_name(namespace.id, "not_existing")
+            .await
+            .unwrap(),
+        None
+    );
+    assert_eq!(
+        repos
+            .tables()
+            .get_by_namespace_and_name(namespace.id, "test_table")
+            .await
+            .unwrap(),
+        Some(t.clone())
+    );
+    assert_eq!(
+        repos
+            .tables()
+            .get_by_namespace_and_name(namespace2.id, "test_table")
+            .await
+            .unwrap()
+            .as_ref(),
+        Some(&test_table)
+    );
+    assert_eq!(
+        repos
+            .tables()
+            .get_by_namespace_and_name(namespace2.id, "foo")
+            .await
+            .unwrap()
+            .as_ref(),
+        Some(&foo_table)
+    );
+
+    // All tables should be returned by list(), regardless of namespace
+    let mut list = repos.tables().list().await.unwrap();
+    list.sort_by_key(|t| t.id);
+    let mut expected = [t, test_table, foo_table];
+    expected.sort_by_key(|t| t.id);
+    assert_eq!(&list, &expected);
+
+    // test per-namespace table limits
+    let latest = repos
+        .namespaces()
+        .update_table_limit("namespace_table_test", MaxTables::try_from(1).unwrap())
+        .await
+        .expect("namespace should be updateable");
+    let err = repos
+        .tables()
+        .create(
+            "definitely_unique",
+            TablePartitionTemplateOverride::try_new(None, &latest.partition_template).unwrap(),
+            latest.id,
+        )
+        .await
+        .expect_err("should error with table create limit error");
+    assert!(matches!(err, Error::LimitExceeded { .. }));
+
+    // Create a table with a partition template other than the default
+    let custom_table_template = TablePartitionTemplateOverride::try_new(
+        Some(proto::PartitionTemplate {
+            parts: vec![
+                proto::TemplatePart {
+                    part: Some(proto::template_part::Part::TagValue("tag1".into())),
+                },
+                proto::TemplatePart {
+                    part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())),
+                },
+                proto::TemplatePart {
+                    part: Some(proto::template_part::Part::TagValue("tag2".into())),
+                },
+            ],
+        }),
+        &namespace2.partition_template,
+    )
+    .unwrap();
+    let templated = repos
+        .tables()
+        .create(
+            "use_a_template",
+            custom_table_template.clone(),
+            namespace2.id,
+        )
+        .await
+        .unwrap();
+    assert_eq!(templated.partition_template, custom_table_template);
+
+    // Tag columns should be created for tags used in the template
+    let table_columns = repos
+        .columns()
+        .list_by_table_id(templated.id)
+        .await
+        .unwrap();
+    assert_eq!(table_columns.len(), 2);
+    assert!(table_columns.iter().all(|c| c.is_tag()));
+    let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect();
+    column_names.sort();
+    assert_eq!(column_names, &["tag1", "tag2"]);
+
+    let lookup_templated = repos
+        .tables()
+        .get_by_namespace_and_name(namespace2.id, "use_a_template")
+        .await
+        .unwrap()
+        .unwrap();
+    assert_eq!(templated, lookup_templated);
+
+    // Create a namespace with a partition template other than the default
+    let custom_namespace_template =
+        NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate {
+            parts: vec![
+                proto::TemplatePart {
+                    part: Some(proto::template_part::Part::TagValue("zzz".into())),
+                },
+                proto::TemplatePart {
+                    part: Some(proto::template_part::Part::TagValue("aaa".into())),
+                },
+                proto::TemplatePart {
+                    part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())),
+                },
+            ],
+        })
+        .unwrap();
+    let custom_namespace_name = NamespaceName::new("custom_namespace").unwrap();
+    let custom_namespace = repos
+        .namespaces()
+        .create(
+            &custom_namespace_name,
+            Some(custom_namespace_template.clone()),
+            None,
+            None,
+        )
+        .await
+        .unwrap();
+    // Create a table without specifying the partition template
+    let custom_table_template =
+        TablePartitionTemplateOverride::try_new(None, &custom_namespace.partition_template)
+            .unwrap();
+    let table_templated_by_namespace = repos
+        .tables()
+        .create(
+            "use_namespace_template",
+            custom_table_template,
+            custom_namespace.id,
+        )
+        .await
+        .unwrap();
+    assert_eq!(
+        table_templated_by_namespace.partition_template,
+        TablePartitionTemplateOverride::try_new(None, &custom_namespace_template).unwrap()
+    );
+
+    // Tag columns should be created for tags used in the template
+    let table_columns = repos
+        .columns()
+        .list_by_table_id(table_templated_by_namespace.id)
+        .await
+        .unwrap();
+    assert_eq!(table_columns.len(), 2);
+    assert!(table_columns.iter().all(|c| c.is_tag()));
+    let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect();
+    column_names.sort();
+    assert_eq!(column_names, &["aaa", "zzz"]);
+
+    repos
+        .namespaces()
+        .soft_delete("namespace_table_test")
+        .await
+        .expect("delete namespace should succeed");
+    repos
+        .namespaces()
+        .soft_delete("two")
+        .await
+        .expect("delete namespace should succeed");
+}
+
+/// Exercise column CRUD: create-or-get idempotency, type-conflict errors,
+/// per-table and per-namespace listing, table snapshots, and column limits.
+async fn test_column(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace = arbitrary_namespace(&mut *repos, "namespace_column_test").await;
+    let table = arbitrary_table(&mut *repos, "test_table", &namespace).await;
+    assert_eq!(table.namespace_id, namespace.id);
+
+    // test we can create or get a column
+    let c = repos
+        .columns()
+        .create_or_get("column_test", table.id, ColumnType::Tag)
+        .await
+        .unwrap();
+
+    let ts1 = repos.tables().snapshot(table.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &ts1).await;
+
+    let cc = repos
+        .columns()
+        .create_or_get("column_test", table.id, ColumnType::Tag)
+        .await
+        .unwrap();
+    assert!(c.id > ColumnId::new(0));
+    assert_eq!(c, cc);
+
+    let ts2 = repos.tables().snapshot(table.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &ts2).await;
+
+    assert_gt(ts2.generation(), ts1.generation());
+
+    // test that attempting to create an already defined column of a different type returns
+    // error
+    let err = repos
+        .columns()
+        .create_or_get("column_test", table.id, ColumnType::U64)
+        .await
+        .expect_err("should error with wrong column type");
+    assert!(matches!(err, Error::AlreadyExists { .. }));
+
+    // test that we can create a column of the same name under a different table
+    let table2 = arbitrary_table(&mut *repos, "test_table_2", &namespace).await;
+    let ccc = repos
+        .columns()
+        .create_or_get("column_test", table2.id, ColumnType::U64)
+        .await
+        .unwrap();
+    assert_ne!(c, ccc);
+
+    let columns = repos
+        .columns()
+        .list_by_namespace_id(namespace.id)
+        .await
+        .unwrap();
+
+    let ts3 = repos.tables().snapshot(table2.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &ts3).await;
+
+    let mut want = vec![c.clone(), ccc];
+    assert_eq!(want, columns);
+
+    let columns = repos.columns().list_by_table_id(table.id).await.unwrap();
+
+    let want2 = vec![c];
+    assert_eq!(want2, columns);
+
+    // Add another tag column into table2
+    let c3 = repos
+        .columns()
+        .create_or_get("b", table2.id, ColumnType::Tag)
+        .await
+        .unwrap();
+
+    let ts4 = repos.tables().snapshot(table2.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &ts4).await;
+
+    assert_gt(ts4.generation(), ts3.generation());
+
+    // Listing columns should return all columns in the catalog
+    let list = repos.columns().list().await.unwrap();
+    want.extend([c3]);
+    assert_eq!(list, want);
+
+    // test create_or_get_many_unchecked, below column limit
+    let mut columns = HashMap::new();
+    columns.insert("column_test", ColumnType::Tag);
+    columns.insert("new_column", ColumnType::Tag);
+    let table1_columns = repos
+        .columns()
+        .create_or_get_many_unchecked(table.id, columns)
+        .await
+        .unwrap();
+    let mut table1_column_names: Vec<_> = table1_columns.iter().map(|c| &c.name).collect();
+    table1_column_names.sort();
+    assert_eq!(table1_column_names, vec!["column_test", "new_column"]);
+
+    // test per-namespace column limits
+    repos
+        .namespaces()
+        .update_column_limit(
+            "namespace_column_test",
+            MaxColumnsPerTable::try_from(1).unwrap(),
+        )
+        .await
+        .expect("namespace should be updateable");
+    let err = repos
+        .columns()
+        .create_or_get("definitely unique", table.id,
ColumnType::Tag)
+        .await
+        .expect_err("should error with table create limit error");
+    assert!(matches!(err, Error::LimitExceeded { .. }));
+
+    // test per-namespace column limits are NOT enforced with create_or_get_many_unchecked
+    let table3 = arbitrary_table(&mut *repos, "test_table_3", &namespace).await;
+    let mut columns = HashMap::new();
+    columns.insert("apples", ColumnType::Tag);
+    columns.insert("oranges", ColumnType::Tag);
+    let table3_columns = repos
+        .columns()
+        .create_or_get_many_unchecked(table3.id, columns)
+        .await
+        .unwrap();
+    let mut table3_column_names: Vec<_> = table3_columns.iter().map(|c| &c.name).collect();
+    table3_column_names.sort();
+    assert_eq!(table3_column_names, vec!["apples", "oranges"]);
+
+    repos
+        .namespaces()
+        .soft_delete("namespace_column_test")
+        .await
+        .expect("delete namespace should succeed");
+}
+
+/// Exercise partition CRUD: create-or-get, batch lookup, listing, sort-key CAS
+/// (success and mismatch paths), snapshots, and skipped-compaction records.
+async fn test_partition(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace = arbitrary_namespace(&mut *repos, "namespace_partition_test").await;
+    let table = arbitrary_table(&mut *repos, "test_table", &namespace).await;
+
+    let mut created = BTreeMap::new();
+    // partition to use
+    let partition = repos
+        .partitions()
+        .create_or_get("foo".into(), table.id)
+        .await
+        .expect("failed to create partition");
+    // Test: sort_key_ids from create_or_get
+    assert!(partition.sort_key_ids().is_none());
+    created.insert(partition.id, partition.clone());
+    // partition to use
+    let partition_bar = repos
+        .partitions()
+        .create_or_get("bar".into(), table.id)
+        .await
+        .expect("failed to create partition");
+    created.insert(partition_bar.id, partition_bar);
+    // partition to be skipped later
+    let to_skip_partition = repos
+        .partitions()
+        .create_or_get("asdf".into(), table.id)
+        .await
+        .unwrap();
+    created.insert(to_skip_partition.id, to_skip_partition.clone());
+    // partition to be skipped later
+    let to_skip_partition_too = repos
+        .partitions()
+        .create_or_get("asdf too".into(), table.id)
+        .await
+        .unwrap();
+    created.insert(to_skip_partition_too.id, to_skip_partition_too.clone());
+
+    // partitions can be retrieved easily
+    let mut created_sorted = created.values().cloned().collect::<Vec<_>>();
+    created_sorted.sort_by_key(|p| p.id);
+    assert_eq!(
+        to_skip_partition,
+        repos
+            .partitions()
+            .get_by_id_batch(&[to_skip_partition.id])
+            .await
+            .unwrap()
+            .into_iter()
+            .next()
+            .unwrap()
+    );
+    let non_existing_partition_id = PartitionId::new(i64::MAX);
+    assert!(repos
+        .partitions()
+        .get_by_id_batch(&[non_existing_partition_id])
+        .await
+        .unwrap()
+        .is_empty());
+    let mut batch = repos
+        .partitions()
+        .get_by_id_batch(
+            &created
+                .keys()
+                .cloned()
+                // non-existing entries are ignored
+                .chain([non_existing_partition_id])
+                // duplicates are ignored
+                .chain(created.keys().cloned())
+                .collect::<Vec<_>>(),
+        )
+        .await
+        .unwrap();
+    batch.sort_by_key(|p| p.id);
+    assert_eq!(created_sorted, batch);
+    // Test: sort_key_ids from get_by_id_batch
+    assert!(batch.iter().all(|p| p.sort_key_ids().is_none()));
+
+    assert_eq!(created_sorted, batch);
+
+    let s1 = repos.tables().snapshot(table.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &s1).await;
+
+    let listed = repos
+        .partitions()
+        .list_by_table_id(table.id)
+        .await
+        .expect("failed to list partitions")
+        .into_iter()
+        .map(|v| (v.id, v))
+        .collect::<BTreeMap<_, _>>();
+    // Test: sort_key_ids from list_by_table_id
+    assert!(listed.values().all(|p| p.sort_key_ids().is_none()));
+
+    assert_eq!(created, listed);
+
+    let listed = repos
+        .partitions()
+        .list_ids()
+        .await
+        .expect("failed to list partitions")
+        .into_iter()
+        .collect::<BTreeSet<_>>();
+
+    assert_eq!(created.keys().copied().collect::<BTreeSet<_>>(), listed);
+
+    // The code no longer supports creating old-style partitions, so this list is always empty
+    // in these tests. See each catalog implementation for tests that insert old-style
+    // partitions directly and verify they're returned.
+    let old_style = repos.partitions().list_old_style().await.unwrap();
+    assert!(
+        old_style.is_empty(),
+        "Expected no old-style partitions, got {old_style:?}"
+    );
+
+    // sort key should be unset on creation
+    assert!(to_skip_partition.sort_key_ids().is_none());
+
+    let s1 = repos
+        .partitions()
+        .snapshot(to_skip_partition.id)
+        .await
+        .unwrap();
+    validate_partition_snapshot(repos.as_mut(), &s1).await;
+
+    // test that updates sort key from None to Some
+    let updated_partition = repos
+        .partitions()
+        .cas_sort_key(to_skip_partition.id, None, &SortKeyIds::from([2, 1, 3]))
+        .await
+        .unwrap();
+
+    // verify sort key is updated correctly
+    assert_eq!(
+        updated_partition.sort_key_ids().unwrap(),
+        &SortKeyIds::from([2, 1, 3])
+    );
+
+    let s2 = repos
+        .partitions()
+        .snapshot(to_skip_partition.id)
+        .await
+        .unwrap();
+    assert_gt(s2.generation(), s1.generation());
+    validate_partition_snapshot(repos.as_mut(), &s2).await;
+
+    // test that provides value of old_sort_key_ids but it do not match the existing one
+    // --> the new sort key will not be updated
+    let err = repos
+        .partitions()
+        .cas_sort_key(
+            to_skip_partition.id,
+            Some(&SortKeyIds::from([1])),
+            &SortKeyIds::from([1, 2, 3, 4]),
+        )
+        .await
+        .expect_err("CAS with incorrect value should fail");
+    // verify the sort key is not updated
+    assert_matches!(err, CasFailure::ValueMismatch(old_sort_key_ids) => {
+        assert_eq!(old_sort_key_ids, SortKeyIds::from([2, 1, 3]));
+    });
+
+    // test that provides same length but not-matched old_sort_key_ids
+    // --> the new sort key will not be updated
+    let err = repos
+        .partitions()
+        .cas_sort_key(
+            to_skip_partition.id,
+            Some(&SortKeyIds::from([1, 5, 10])),
+            &SortKeyIds::from([1, 2, 3, 4]),
+        )
+        .await
+        .expect_err("CAS with incorrect value should fail");
+    // verify the sort key is not updated
+    assert_matches!(err, CasFailure::ValueMismatch(old_sort_key_ids) => {
+        assert_eq!(old_sort_key_ids, SortKeyIds::from([2, 1, 3]));
+    });
+
+    // test that provide None sort_key_ids that do not match with existing values that are not None
+    // --> the new sort key will not be updated
+    let err = repos
+        .partitions()
+        .cas_sort_key(to_skip_partition.id, None, &SortKeyIds::from([1, 2, 3, 4]))
+        .await
+        .expect_err("CAS with incorrect value should fail");
+    assert_matches!(err, CasFailure::ValueMismatch(old_sort_key_ids) => {
+        assert_eq!(old_sort_key_ids, SortKeyIds::from([2, 1, 3]));
+    });
+
+    // test getting partition from partition id and verify values of sort_key and sort_key_ids
+    let updated_other_partition = repos
+        .partitions()
+        .get_by_id_batch(&[to_skip_partition.id])
+        .await
+        .unwrap()
+        .into_iter()
+        .next()
+        .unwrap();
+    // still has the old sort key
+    assert_eq!(
+        updated_other_partition.sort_key_ids().unwrap(),
+        &SortKeyIds::from([2, 1, 3])
+    );
+
+    // test that updates sort_key_ids from Some matching value to Some other value
+    let updated_partition = repos
+        .partitions()
+        .cas_sort_key(
+            to_skip_partition.id,
+            Some(&SortKeyIds::from([2, 1, 3])),
+            &SortKeyIds::from([2, 1, 4, 3]),
+        )
+        .await
+        .unwrap();
+    // verify the new values are updated
+    assert_eq!(
+        updated_partition.sort_key_ids().unwrap(),
+        &SortKeyIds::from([2, 1, 4, 3])
+    );
+
+    // test getting the new sort key from partition id
+    let updated_partition = repos
+        .partitions()
+        .get_by_id_batch(&[to_skip_partition.id])
+        .await
+        .unwrap()
+        .into_iter()
+        .next()
+        .unwrap();
+    assert_eq!(
+        updated_partition.sort_key_ids().unwrap(),
+        &SortKeyIds::from([2, 1, 4, 3])
+    );
+
+    // use to_skip_partition_too to update sort key from empty old values
+    // first make sure the old sort key is unset
+    assert!(to_skip_partition_too.sort_key_ids().is_none());
+
+    // test that provides empty old_sort_key_ids
+    // --> the new sort key will be updated
+    let updated_to_skip_partition_too = repos
+        .partitions()
+        .cas_sort_key(to_skip_partition_too.id, None, &SortKeyIds::from([3, 4]))
+        .await
+        .unwrap();
+    // verify the new values are updated
+    assert_eq!(
+        updated_to_skip_partition_too.sort_key_ids().unwrap(),
+        &SortKeyIds::from([3, 4])
+    );
+
+    let s3 = repos
+        .partitions()
+        .snapshot(to_skip_partition.id)
+        .await
+        .unwrap();
+    assert_gt(s3.generation(), s2.generation());
+    validate_partition_snapshot(repos.as_mut(), &s3).await;
+
+    // The compactor can log why compaction was skipped
+    let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap();
+    assert!(
+        skipped_compactions.is_empty(),
+        "Expected no skipped compactions, got: {skipped_compactions:?}"
+    );
+    repos
+        .partitions()
+        .record_skipped_compaction(to_skip_partition.id, "I am le tired", 1, 2, 4, 10, 20)
+        .await
+        .unwrap();
+    let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap();
+    assert_eq!(skipped_compactions.len(), 1);
+    assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id);
+    assert_eq!(skipped_compactions[0].reason, "I am le tired");
+    assert_eq!(skipped_compactions[0].num_files, 1);
+    assert_eq!(skipped_compactions[0].limit_num_files, 2);
+    assert_eq!(skipped_compactions[0].estimated_bytes, 10);
+    assert_eq!(skipped_compactions[0].limit_bytes, 20);
+    //
+    let skipped_partition_records = repos
+        .partitions()
+        .get_in_skipped_compactions(&[
+            to_skip_partition.id,
+            PartitionId::new(i64::MAX),
+            to_skip_partition.id,
+        ])
+        .await
+        .unwrap();
+    assert_eq!(
+        skipped_partition_records[0].partition_id,
+        to_skip_partition.id
+    );
+    assert_eq!(skipped_partition_records[0].reason, "I am le tired");
+
+    let s4 = repos
+        .partitions()
+        .snapshot(to_skip_partition.id)
+        .await
+        .unwrap();
+    assert_gt(s4.generation(), s3.generation());
+    validate_partition_snapshot(repos.as_mut(), &s4).await;
+
+    // Only save the last reason that any particular partition was skipped (really if the
+    // partition appears in the skipped compactions, it shouldn't become a compaction candidate
+    // again, but race conditions and all that)
+    repos
+        .partitions()
+        .record_skipped_compaction(to_skip_partition.id, "I'm on fire", 11, 12, 24, 110, 120)
+        .await
+        .unwrap();
+    let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap();
+    assert_eq!(skipped_compactions.len(), 1);
+    assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id);
+    assert_eq!(skipped_compactions[0].reason, "I'm on fire");
+    assert_eq!(skipped_compactions[0].num_files, 11);
+    assert_eq!(skipped_compactions[0].limit_num_files, 12);
+    assert_eq!(skipped_compactions[0].estimated_bytes, 110);
+    assert_eq!(skipped_compactions[0].limit_bytes, 120);
+    //
+    let skipped_partition_records = repos
+        .partitions()
+        .get_in_skipped_compactions(&[to_skip_partition.id])
+        .await
+        .unwrap();
+    assert_eq!(
+        skipped_partition_records[0].partition_id,
+        to_skip_partition.id
+    );
+    assert_eq!(skipped_partition_records[0].reason, "I'm on fire");
+
+    // Can receive multiple skipped compactions for different partitions
+    repos
+        .partitions()
+        .record_skipped_compaction(
+            to_skip_partition_too.id,
+            "I am le tired too",
+            1,
+            2,
+            4,
+            10,
+            20,
+        )
+        .await
+        .unwrap();
+    let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap();
+    assert_eq!(skipped_compactions.len(), 2);
+    assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id);
+    assert_eq!(
+        skipped_compactions[1].partition_id,
+        to_skip_partition_too.id
+    );
+    // confirm can fetch subset of skipped compactions (a.k.a.
have two, only fetch 1) + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition.id]) + .await + .unwrap(); + assert_eq!(skipped_partition_records.len(), 1); + assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition_too.id]) + .await + .unwrap(); + assert_eq!(skipped_partition_records.len(), 1); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition_too.id + ); + // confirm can fetch both skipped compactions, and not the unskipped one + // also confirm will not error on non-existing partition + let non_existing_partition_id = PartitionId::new(9999); + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[ + partition.id, + to_skip_partition.id, + to_skip_partition_too.id, + non_existing_partition_id, + ]) + .await + .unwrap(); + assert_eq!(skipped_partition_records.len(), 2); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition.id + ); + assert_eq!( + skipped_partition_records[1].partition_id, + to_skip_partition_too.id + ); + + // Delete the skipped compactions + let deleted_skipped_compaction = repos + .partitions() + .delete_skipped_compactions(to_skip_partition.id) + .await + .unwrap() + .expect("The skipped compaction should have been returned"); + assert_eq!( + deleted_skipped_compaction.partition_id, + to_skip_partition.id + ); + assert_eq!(deleted_skipped_compaction.reason, "I'm on fire"); + assert_eq!(deleted_skipped_compaction.num_files, 11); + assert_eq!(deleted_skipped_compaction.limit_num_files, 12); + assert_eq!(deleted_skipped_compaction.estimated_bytes, 110); + assert_eq!(deleted_skipped_compaction.limit_bytes, 120); + // + let deleted_skipped_compaction = repos + .partitions() + .delete_skipped_compactions(to_skip_partition_too.id) + .await + .unwrap() + .expect("The skipped compaction should have 
been returned"); + assert_eq!( + deleted_skipped_compaction.partition_id, + to_skip_partition_too.id + ); + assert_eq!(deleted_skipped_compaction.reason, "I am le tired too"); + // + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition.id]) + .await + .unwrap(); + assert!(skipped_partition_records.is_empty()); + + let not_deleted_skipped_compaction = repos + .partitions() + .delete_skipped_compactions(to_skip_partition.id) + .await + .unwrap(); + + assert!( + not_deleted_skipped_compaction.is_none(), + "There should be no skipped compation", + ); + + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert!( + skipped_compactions.is_empty(), + "Expected no skipped compactions, got: {skipped_compactions:?}" + ); + + let recent = repos + .partitions() + .most_recent_n(10) + .await + .expect("should list most recent"); + assert_eq!(recent.len(), 4); + + // Test: sort_key_ids from most_recent_n + // Only the first two partitions (represent to_skip_partition_too and to_skip_partition) have vallues, the others are empty + assert_eq!( + recent[0].sort_key_ids().unwrap(), + &SortKeyIds::from(vec![3, 4]) + ); + assert_eq!( + recent[1].sort_key_ids().unwrap(), + &SortKeyIds::from(vec![2, 1, 4, 3]) + ); + assert!(recent[2].sort_key_ids().is_none()); + assert!(recent[3].sort_key_ids().is_none()); + + let recent = repos + .partitions() + .most_recent_n(4) + .await + .expect("should list most recent"); + assert_eq!(recent.len(), 4); // no off by one error + + let recent = repos + .partitions() + .most_recent_n(2) + .await + .expect("should list most recent"); + assert_eq!(recent.len(), 2); + + repos + .namespaces() + .soft_delete("namespace_partition_test") + .await + .expect("delete namespace should succeed"); +} + +async fn validate_partition_snapshot(repos: &mut dyn RepoCollection, snapshot: &PartitionSnapshot) { + // compare files + let mut expected = repos + .parquet_files() + 
.list_by_partition_not_to_delete_batch(vec![snapshot.partition_id()]) + .await + .unwrap(); + expected.sort_unstable_by_key(|x| x.id); + let mut actual = snapshot.files().collect::, _>>().unwrap(); + actual.sort_unstable_by_key(|x| x.id); + assert_eq!(expected, actual); + + // compare skipped partition + let expected = repos + .partitions() + .get_in_skipped_compactions(&[snapshot.partition_id()]) + .await + .unwrap() + .into_iter() + .next(); + let actual = snapshot.skipped_compaction(); + assert_eq!(actual, expected); + + // compare partition itself + let actual = snapshot.partition().unwrap(); + let expected = repos + .partitions() + .get_by_id(snapshot.partition_id()) + .await + .unwrap() + .unwrap(); + assert_eq!(actual, expected); +} + +async fn validate_table_snapshot(repos: &mut dyn RepoCollection, snapshot: &TableSnapshot) { + let table = snapshot.table().unwrap(); + + let expected = repos.tables().get_by_id(table.id).await.unwrap().unwrap(); + assert_eq!(table, expected); + + // compare columns + let mut expected = repos.columns().list_by_table_id(table.id).await.unwrap(); + expected.sort_unstable_by_key(|x| x.id); + let mut actual = snapshot.columns().collect::, _>>().unwrap(); + actual.sort_unstable_by_key(|x| x.id); + assert_eq!(expected, actual); + + // compare partitions + let mut expected = repos.partitions().list_by_table_id(table.id).await.unwrap(); + expected.sort_unstable_by_key(|x| x.id); + let mut actual = snapshot + .partitions() + .collect::, _>>() + .unwrap(); + actual.sort_unstable_by_key(|x| x.id()); + assert_eq!(expected.len(), actual.len()); + + let eq = expected + .iter() + .zip(&actual) + .all(|(l, r)| l.id == r.id() && l.partition_key.as_bytes() == r.key()); + assert!(eq, "expected {expected:?} got {actual:?}"); +} + +/// List all parquet files in given namespace. 
+async fn list_parquet_files_by_namespace_not_to_delete( + catalog: Arc, + namespace_id: NamespaceId, +) -> Vec { + let partitions = futures::stream::iter( + catalog + .repositories() + .tables() + .list_by_namespace_id(namespace_id) + .await + .unwrap(), + ) + .then(|t| { + let catalog = Arc::clone(&catalog); + async move { + futures::stream::iter( + catalog + .repositories() + .partitions() + .list_by_table_id(t.id) + .await + .unwrap(), + ) + } + }) + .flatten() + .map(|p| p.id) + .collect::>() + .await; + + catalog + .repositories() + .parquet_files() + .list_by_partition_not_to_delete_batch(partitions) + .await + .unwrap() +} + +/// tests many interactions with the catalog and parquet files. See the individual conditions +/// herein +async fn test_parquet_file(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test").await; + let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; + let other_table = arbitrary_table(&mut *repos, "other", &namespace).await; + let partition = repos + .partitions() + .create_or_get("one".into(), table.id) + .await + .unwrap(); + let other_partition = repos + .partitions() + .create_or_get("one".into(), other_table.id) + .await + .unwrap(); + + let ts1 = repos.tables().snapshot(table.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts1).await; + + let ts2 = repos.tables().snapshot(other_table.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts2).await; + + let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); + let parquet_file = repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + + // verify we can get it by its object store id + let pfg = repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap(); + assert_eq!(parquet_file, pfg.unwrap()); + + // verify that trying to create a file with the 
same UUID throws an error + let err = repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap_err(); + assert!(matches!(err, Error::AlreadyExists { .. })); + + let other_params = ParquetFileParams { + table_id: other_partition.table_id, + partition_id: other_partition.id, + partition_hash_id: other_partition.hash_id().cloned(), + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(50), + max_time: Timestamp::new(60), + ..parquet_file_params.clone() + }; + let other_file = repos.parquet_files().create(other_params).await.unwrap(); + + let exist_id = parquet_file.id; + let non_exist_id = ParquetFileId::new(other_file.id.get() + 10); + // make sure exists_id != non_exist_id + assert_ne!(exist_id, non_exist_id); + + // verify that to_delete is initially set to null and the file does not get deleted + assert!(parquet_file.to_delete.is_none()); + let older_than = Timestamp::new( + (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), + ); + let deleted = repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .unwrap(); + assert!(deleted.is_empty()); + + // test list_all that includes soft-deleted file + // at this time the file is not soft-deleted yet and will be included in the returned list + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace.id).await; + assert_eq!(files.len(), 2); + + // verify to_delete can be updated to a timestamp + repos + .parquet_files() + .create_upgrade_delete( + parquet_file.partition_id, + &[parquet_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + + // test list_all that includes soft-deleted file + // at this time the file is soft-deleted and will be NOT included in the returned list + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace.id).await; + assert_eq!(files.len(), 1); + + // the deleted file can still be retrieved by UUID though 
+ repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap() + .unwrap(); + + // File is not deleted if it was marked to be deleted after the specified time + let before_deleted = Timestamp::new( + (catalog.time_provider().now() - Duration::from_secs(100)).timestamp_nanos(), + ); + let deleted = repos + .parquet_files() + .delete_old_ids_only(before_deleted) + .await + .unwrap(); + assert!(deleted.is_empty()); + + // not hard-deleted yet + repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap() + .unwrap(); + + // File is deleted if it was marked to be deleted before the specified time + let deleted = repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .unwrap(); + assert_eq!(deleted.len(), 1); + assert_eq!(parquet_file.object_store_id, deleted[0]); + + // test list_all that includes soft-deleted file + // at this time the file is hard deleted -> the returned list is empty + assert!(repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap() + .is_none()); + + // test list + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace.id).await; + assert_eq!(vec![other_file.clone()], files); + + // test list_by_namespace_not_to_delete + let namespace2 = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test1").await; + let table2 = arbitrary_table(&mut *repos, "test_table2", &namespace2).await; + let partition2 = repos + .partitions() + .create_or_get("foo".into(), table2.id) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert!(files.is_empty()); + + let ts3 = repos.tables().snapshot(table2.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts3).await; + + let f1_params = ParquetFileParams { + table_id: partition2.table_id, + partition_id: partition2.id, + partition_hash_id: 
partition2.hash_id().cloned(), + namespace_id: namespace2.id, + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(1), + max_time: Timestamp::new(10), + ..parquet_file_params + }; + let f1 = repos + .parquet_files() + .create(f1_params.clone()) + .await + .unwrap(); + + let f2_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(50), + max_time: Timestamp::new(60), + ..f1_params.clone() + }; + let f2 = repos + .parquet_files() + .create(f2_params.clone()) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f2.clone()], files); + + let f3_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(50), + max_time: Timestamp::new(60), + ..f2_params + }; + let f3 = repos + .parquet_files() + .create(f3_params.clone()) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f2.clone(), f3.clone()], files); + + let s1 = repos.partitions().snapshot(partition2.id).await.unwrap(); + validate_partition_snapshot(repos.as_mut(), &s1).await; + + repos + .parquet_files() + .create_upgrade_delete( + f2.partition_id, + &[f2.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f3.clone()], files); + + // Cannot delete file twice + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f2.object_store_id, f3.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. 
}); + + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f2.object_store_id], + &[f3.object_store_id], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. }); + + // Cannot upgrade deleted file + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f3.object_store_id], + &[f2.object_store_id], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. }); + + // Failed transactions don't modify + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f3.clone()], files); + + let s2 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_gt(s2.generation(), s1.generation()); + validate_partition_snapshot(repos.as_mut(), &s2).await; + + let files = list_parquet_files_by_namespace_not_to_delete( + Arc::clone(&catalog), + NamespaceId::new(i64::MAX), + ) + .await; + assert!(files.is_empty()); + + // test delete_old_ids_only + let older_than = Timestamp::new( + (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), + ); + let ids = repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .unwrap(); + assert_eq!(ids.len(), 1); + + let s3 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_ge(s3.generation(), s2.generation()); // no new snapshot required, but some backends will generate a new one + validate_partition_snapshot(repos.as_mut(), &s3).await; + + // test retention-based flagging for deletion + // Since mem catalog has default retention 1 hour, let us first set it to 0 means infinite + let namespaces = repos + .namespaces() + .list(SoftDeletedRows::AllRows) + .await + .expect("listing namespaces"); + for namespace in namespaces { + repos + .namespaces() + .update_retention_period(&namespace.name, None) // infinite + .await + .unwrap(); + } + + // 1. 
with no retention period set on the ns, nothing should get flagged + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert!(ids.is_empty()); + // 2. set ns retention period to one hour then create some files before and after and + // ensure correct files get deleted + repos + .namespaces() + .update_retention_period(&namespace2.name, Some(60 * 60 * 1_000_000_000)) // 1 hour + .await + .unwrap(); + let f4_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + max_time: Timestamp::new( + // a bit over an hour ago + (catalog.time_provider().now() - Duration::from_secs(60 * 65)).timestamp_nanos(), + ), + ..f3_params + }; + let f4 = repos + .parquet_files() + .create(f4_params.clone()) + .await + .unwrap(); + let f5_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + max_time: Timestamp::new( + // a bit under an hour ago + (catalog.time_provider().now() - Duration::from_secs(60 * 55)).timestamp_nanos(), + ), + ..f4_params + }; + let f5 = repos + .parquet_files() + .create(f5_params.clone()) + .await + .unwrap(); + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert!(ids.len() > 1); // it's also going to flag f1, f2 & f3 because they have low max + // timestamps but i don't want this test to be brittle if those + // values change so i'm not asserting len == 4 + let f4 = repos + .parquet_files() + .get_by_object_store_id(f4.object_store_id) + .await + .unwrap() + .unwrap(); + assert_matches!(f4.to_delete, Some(_)); // f4 is > 1hr old + let f5 = repos + .parquet_files() + .get_by_object_store_id(f5.object_store_id) + .await + .unwrap() + .unwrap(); + assert_matches!(f5.to_delete, None); // f5 is < 1hr old + + let s4 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_gt(s4.generation(), s3.generation()); + validate_partition_snapshot(repos.as_mut(), &s4).await; + + // call flag_for_delete_by_retention() again and nothing 
should be flagged because they've + // already been flagged + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert!(ids.is_empty()); + + // test that flag_for_delete_by_retention respects UPDATE LIMIT + // create limit + the meaning of life parquet files that are all older than the retention (>1hr) + const LIMIT: usize = 1000; + const MOL: usize = 42; + let now = catalog.time_provider().now(); + let params = (0..LIMIT + MOL) + .map(|_| { + ParquetFileParams { + object_store_id: ObjectStoreId::new(), + max_time: Timestamp::new( + // a bit over an hour ago + (now - Duration::from_secs(60 * 65)).timestamp_nanos(), + ), + ..f1_params.clone() + } + }) + .collect::>(); + repos + .parquet_files() + .create_upgrade_delete( + f1_params.partition_id, + &[], + &[], + ¶ms, + CompactionLevel::Initial, + ) + .await + .unwrap(); + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!(ids.len(), LIMIT); + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!(ids.len(), MOL); // second call took remainder + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!(ids.len(), 0); // none left + + // test create_update_delete + let f6_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..f5_params + }; + let f6 = repos + .parquet_files() + .create(f6_params.clone()) + .await + .unwrap(); + + let f7_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..f6_params + }; + let f1_uuid = f1.object_store_id; + let f6_uuid = f6.object_store_id; + let f5_uuid = f5.object_store_id; + let cud = repos + .parquet_files() + .create_upgrade_delete( + f5.partition_id, + &[f5.object_store_id], + &[f6.object_store_id], + &[f7_params.clone()], + CompactionLevel::Final, + ) + .await + .unwrap(); + + assert_eq!(cud.len(), 1); + let f5_delete = repos + .parquet_files() + 
.get_by_object_store_id(f5_uuid) + .await + .unwrap() + .unwrap(); + assert_matches!(f5_delete.to_delete, Some(_)); + + let f6_compaction_level = repos + .parquet_files() + .get_by_object_store_id(f6_uuid) + .await + .unwrap() + .unwrap(); + + assert_matches!(f6_compaction_level.compaction_level, CompactionLevel::Final); + + let f7 = repos + .parquet_files() + .get_by_object_store_id(f7_params.object_store_id) + .await + .unwrap() + .unwrap(); + + let f7_uuid = f7.object_store_id; + + // test create_update_delete transaction (rollback because f7 already exists) + let cud = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[], + &[], + &[f7_params.clone()], + CompactionLevel::Final, + ) + .await; + + assert_matches!( + cud, + Err(Error::AlreadyExists { + descr + }) if descr == f7_params.object_store_id.to_string() + ); + + let f1_to_delete = repos + .parquet_files() + .get_by_object_store_id(f1_uuid) + .await + .unwrap() + .unwrap(); + assert_matches!(f1_to_delete.to_delete, Some(_)); + + let f7_not_delete = repos + .parquet_files() + .get_by_object_store_id(f7_uuid) + .await + .unwrap() + .unwrap(); + assert_matches!(f7_not_delete.to_delete, None); + + // test exists_by_object_store_id_batch returns parquet files by object store id + let does_not_exist = ObjectStoreId::new(); + let mut present = repos + .parquet_files() + .exists_by_object_store_id_batch(vec![f1_uuid, f7_uuid, does_not_exist]) + .await + .unwrap(); + let mut expected = vec![f1_uuid, f7_uuid]; + present.sort(); + expected.sort(); + assert_eq!(present, expected); + + let s5 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_gt(s5.generation(), s4.generation()); + validate_partition_snapshot(repos.as_mut(), &s5).await; + + // Cannot mix partition IDs + let partition3 = repos + .partitions() + .create_or_get("three".into(), table.id) + .await + .unwrap(); + + let ts4 = repos.tables().snapshot(table.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), 
&ts4).await; + assert_gt(ts4.generation(), ts1.generation()); + + let f8_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + partition_id: partition3.id, + ..f7_params + }; + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f7_uuid], + &[], + &[f8_params.clone()], + CompactionLevel::Final, + ) + .await + .unwrap_err() + .to_string(); + + assert!( + err.contains("Inconsistent ParquetFileParams, expected PartitionId"), + "{err}" + ); + + let list = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition2.id]) + .await + .unwrap(); + assert_eq!(list.len(), 2); + + repos + .parquet_files() + .create_upgrade_delete( + partition3.id, + &[], + &[], + &[f8_params.clone()], + CompactionLevel::Final, + ) + .await + .unwrap(); + + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition3.id]) + .await + .unwrap(); + assert_eq!(files.len(), 1); + let f8_uuid = files[0].object_store_id; + + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![]) + .await + .unwrap(); + assert_eq!(files.len(), 0); + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition2.id, partition3.id]) + .await + .unwrap(); + assert_eq!(files.len(), 3); + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![ + partition2.id, + PartitionId::new(i64::MAX), + partition3.id, + partition2.id, + ]) + .await + .unwrap(); + assert_eq!(files.len(), 3); + + let err = repos + .parquet_files() + .create_upgrade_delete(partition2.id, &[f8_uuid], &[], &[], CompactionLevel::Final) + .await + .unwrap_err(); + + assert_matches!(err, Error::NotFound { .. }); + + let err = repos + .parquet_files() + .create_upgrade_delete(partition2.id, &[], &[f8_uuid], &[], CompactionLevel::Final) + .await + .unwrap_err(); + + assert_matches!(err, Error::NotFound { .. 
}); + + repos + .parquet_files() + .create_upgrade_delete(partition3.id, &[f8_uuid], &[], &[], CompactionLevel::Final) + .await + .unwrap(); + + // take snapshot of unknown partition + let err = repos + .partitions() + .snapshot(PartitionId::new(i64::MAX)) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. }); +} + +async fn test_parquet_file_delete_broken(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace_1 = arbitrary_namespace(&mut *repos, "retention_broken_1").await; + let namespace_2 = repos + .namespaces() + .create( + &NamespaceName::new("retention_broken_2").unwrap(), + None, + Some(1), + None, + ) + .await + .unwrap(); + let table_1 = arbitrary_table(&mut *repos, "test_table", &namespace_1).await; + let table_2 = arbitrary_table(&mut *repos, "test_table", &namespace_2).await; + let partition_1 = repos + .partitions() + .create_or_get("one".into(), table_1.id) + .await + .unwrap(); + let partition_2 = repos + .partitions() + .create_or_get("one".into(), table_2.id) + .await + .unwrap(); + + let parquet_file_params_1 = arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); + let parquet_file_params_2 = arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); + let _parquet_file_1 = repos + .parquet_files() + .create(parquet_file_params_1) + .await + .unwrap(); + let parquet_file_2 = repos + .parquet_files() + .create(parquet_file_params_2) + .await + .unwrap(); + + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!( + ids, + vec![(parquet_file_2.partition_id, parquet_file_2.object_store_id)] + ); +} + +async fn test_partitions_new_file_between(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "test_partitions_new_file_between").await; + let table = arbitrary_table(&mut *repos, "test_table_for_new_file_between", &namespace).await; + + // param for the tests + let time_now = 
Timestamp::from(catalog.time_provider().now()); + let time_one_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(1)); + let time_two_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(2)); + let time_three_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(3)); + let time_five_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(5)); + let time_six_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(6)); + + // Db has no partitions + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // ----------------- + // PARTITION one + // The DB has 1 partition but it does not have any file + let partition1 = repos + .partitions() + .create_or_get("one".into(), table.id) + .await + .unwrap(); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // create files for partition one + let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition1); + + // create a deleted L0 file that was created 3 hours ago + let delete_l0_file = repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + repos + .parquet_files() + .create_upgrade_delete( + delete_l0_file.partition_id, + &[delete_l0_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert!(partitions.is_empty()); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, Some(time_one_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_one_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // create a 
deleted L0 file that was created 1 hour ago + let l0_one_hour_ago_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_one_hour_ago, + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l0_one_hour_ago_file_params.clone()) + .await + .unwrap(); + // partition one should be returned + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_two_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // ----------------- + // PARTITION two + // Partition two without any file + let partition2 = repos + .partitions() + .create_or_get("two".into(), table.id) + .await + .unwrap(); + // should return partition one only + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + + // Add a L0 file created 5 hours ago + let l0_five_hour_ago_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_five_hour_ago, + partition_id: partition2.id, + partition_hash_id: 
partition2.hash_id().cloned(), + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l0_five_hour_ago_file_params.clone()) + .await + .unwrap(); + // still return partition one only + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + // Between six and three hours ago, return only partition 2 + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition2.id); + + // Add an L1 file created just now + let l1_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_now, + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + compaction_level: CompactionLevel::FileNonOverlapped, + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l1_file_params.clone()) + .await + .unwrap(); + // should return both partitions + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + // Only return partition1: the creation time must be strictly less than the maximum time, + // not equal + let mut partitions = repos + .partitions() + 
.partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // ----------------- + // PARTITION three + // Partition three without any file + let partition3 = repos + .partitions() + .create_or_get("three".into(), table.id) + .await + .unwrap(); + // should return partition one and two only + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + // Only return partition1: the creation time must be strictly less than the maximum time, + // not equal + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + // When the maximum time is greater than the creation time of partition2, return it + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now + 1)) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // Add an L2 file created just now for partition three + let l2_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_now, + 
partition_id: partition3.id, + partition_hash_id: partition3.hash_id().cloned(), + compaction_level: CompactionLevel::Final, + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l2_file_params.clone()) + .await + .unwrap(); + // now should return partition one two and three + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 3); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + assert_eq!(partitions[2], partition3.id); + // Only return partition1: the creation time must be strictly less than the maximum time, + // not equal + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // add an L0 file created one hour ago for partition three + let l0_one_hour_ago_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_one_hour_ago, + partition_id: partition3.id, + partition_hash_id: partition3.hash_id().cloned(), + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l0_one_hour_ago_file_params.clone()) + .await + .unwrap(); + // should return all partitions + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 3); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + assert_eq!(partitions[2], partition3.id); + // Only return partitions 1 and 3; 2 was created just now + let mut partitions = repos + 
.partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition3.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); +} + +async fn test_list_by_partiton_not_to_delete(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace( + &mut *repos, + "namespace_parquet_file_test_list_by_partiton_not_to_delete", + ) + .await; + let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; + + let partition = repos + .partitions() + .create_or_get("test_list_by_partiton_not_to_delete_one".into(), table.id) + .await + .unwrap(); + let partition2 = repos + .partitions() + .create_or_get("test_list_by_partiton_not_to_delete_two".into(), table.id) + .await + .unwrap(); + + let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); + + let parquet_file = repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + let delete_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + let delete_file = repos + .parquet_files() + .create(delete_file_params) + .await + .unwrap(); + repos + .parquet_files() + .create_upgrade_delete( + partition.id, + &[delete_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + let level1_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + let mut level1_file = repos + .parquet_files() + .create(level1_file_params) + .await + .unwrap(); + repos + .parquet_files() + .create_upgrade_delete( + partition.id, + &[], + 
&[level1_file.object_store_id], + &[], + CompactionLevel::FileNonOverlapped, + ) + .await + .unwrap(); + level1_file.compaction_level = CompactionLevel::FileNonOverlapped; + + let other_partition_params = ParquetFileParams { + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + let _partition2_file = repos + .parquet_files() + .create(other_partition_params) + .await + .unwrap(); + + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition.id]) + .await + .unwrap(); + assert_eq!(files.len(), 2); + + let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect(); + file_ids.sort(); + let mut expected_ids = vec![parquet_file.id, level1_file.id]; + expected_ids.sort(); + assert_eq!(file_ids, expected_ids); + + // Using the catalog partition ID should return the same files, even if the Parquet file + // records don't have the partition ID on them (which is the default now) + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition.id]) + .await + .unwrap(); + assert_eq!(files.len(), 2); + + let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect(); + file_ids.sort(); + let mut expected_ids = vec![parquet_file.id, level1_file.id]; + expected_ids.sort(); + assert_eq!(file_ids, expected_ids); +} + +async fn test_update_to_compaction_level_1(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = + arbitrary_namespace(&mut *repos, "namespace_update_to_compaction_level_1_test").await; + let table = arbitrary_table(&mut *repos, "update_table", &namespace).await; + let partition = repos + .partitions() + .create_or_get("test_update_to_compaction_level_1_one".into(), table.id) + .await + .unwrap(); + + // Set up the window of times we're interested in level 1 files for + let query_min_time = Timestamp::new(5); + let query_max_time = Timestamp::new(10); + + // 
Create a file with times entirely within the window + let mut parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); + parquet_file_params.min_time = query_min_time + 1; + parquet_file_params.max_time = query_max_time - 1; + let parquet_file = repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + + // Create a file that will remain as level 0 + let level_0_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + repos.parquet_files().create(level_0_params).await.unwrap(); + + // Make parquet_file compaction level 1 + let created = repos + .parquet_files() + .create_upgrade_delete( + parquet_file.partition_id, + &[], + &[parquet_file.object_store_id], + &[], + CompactionLevel::FileNonOverlapped, + ) + .await + .unwrap(); + assert_eq!(created, vec![]); + + // remove namespace to avoid it from affecting later tests + repos + .namespaces() + .soft_delete("namespace_update_to_compaction_level_1_test") + .await + .expect("delete namespace should succeed"); +} + +/// Assert that a namespace deletion does NOT cascade to the tables/schema +/// items/parquet files/etc. +/// +/// Removal of this entities breaks the invariant that once created, a row +/// always exists for the lifetime of an IOx process, and causes the system +/// to panic in multiple components. It's also ineffective, because most +/// components maintain a cache of at least one of these entities. +/// +/// Instead soft deleted namespaces should have their files GC'd like a +/// normal parquet file deletion, removing the rows once they're no longer +/// being actively used by the system. This is done by waiting a long time +/// before deleting records, and whilst isn't perfect, it is largely +/// effective. 
+async fn test_delete_namespace(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace_1 = arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_1").await; + let table_1 = arbitrary_table(&mut *repos, "test_table_1", &namespace_1).await; + let _c = repos + .columns() + .create_or_get("column_test_1", table_1.id, ColumnType::Tag) + .await + .unwrap(); + let partition_1 = repos + .partitions() + .create_or_get("test_delete_namespace_one".into(), table_1.id) + .await + .unwrap(); + + // parquet files + let parquet_file_params = arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); + repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + let parquet_file_params_2 = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(200), + max_time: Timestamp::new(300), + ..parquet_file_params + }; + repos + .parquet_files() + .create(parquet_file_params_2.clone()) + .await + .unwrap(); + + // we've now created a namespace with a table and parquet files. before we test deleting + // it, let's create another so we can ensure that doesn't get deleted. 
+ let namespace_2 = arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_2").await; + let table_2 = arbitrary_table(&mut *repos, "test_table_2", &namespace_2).await; + let _c = repos + .columns() + .create_or_get("column_test_2", table_2.id, ColumnType::Tag) + .await + .unwrap(); + let partition_2 = repos + .partitions() + .create_or_get("test_delete_namespace_two".into(), table_2.id) + .await + .unwrap(); + + // parquet files + let parquet_file_params = arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); + repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + let parquet_file_params_2 = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(200), + max_time: Timestamp::new(300), + ..parquet_file_params + }; + repos + .parquet_files() + .create(parquet_file_params_2.clone()) + .await + .unwrap(); + + // now delete namespace_1 and assert it's all gone and none of + // namespace_2 is gone + repos + .namespaces() + .soft_delete("namespace_test_delete_namespace_1") + .await + .expect("delete namespace should succeed"); + // assert that namespace is soft-deleted, but the table, column, and parquet files are all + // still there. + assert!(repos + .namespaces() + .get_by_id(namespace_1.id, SoftDeletedRows::ExcludeDeleted) + .await + .expect("get namespace should succeed") + .is_none()); + assert_eq!( + repos + .namespaces() + .get_by_id(namespace_1.id, SoftDeletedRows::AllRows) + .await + .expect("get namespace should succeed") + .map(|mut v| { + // The only change after soft-deletion should be the deleted_at + // field being set - this block normalises that field, so that + // the before/after can be asserted as equal. 
+ v.deleted_at = None; + v + }) + .expect("should see soft-deleted row"), + namespace_1 + ); + assert_eq!( + repos + .tables() + .get_by_id(table_1.id) + .await + .expect("get table should succeed") + .expect("should return row"), + table_1 + ); + assert_eq!( + repos + .columns() + .list_by_namespace_id(namespace_1.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + assert_eq!( + repos + .columns() + .list_by_table_id(table_1.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + + // partition's get_by_id should succeed + repos + .partitions() + .get_by_id_batch(&[partition_1.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); + + // assert that the namespace, table, column, and parquet files for namespace_2 are still + // there + assert!(repos + .namespaces() + .get_by_id(namespace_2.id, SoftDeletedRows::ExcludeDeleted) + .await + .expect("get namespace should succeed") + .is_some()); + + assert!(repos + .tables() + .get_by_id(table_2.id) + .await + .expect("get table should succeed") + .is_some()); + assert_eq!( + repos + .columns() + .list_by_namespace_id(namespace_2.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + assert_eq!( + repos + .columns() + .list_by_table_id(table_2.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + + // partition's get_by_id should succeed + repos + .partitions() + .get_by_id_batch(&[partition_2.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); +} + +/// Upsert a namespace called `namespace_name` and write `lines` to it. +async fn populate_namespace( + repos: &mut R, + namespace_name: &str, + lines: &str, +) -> (Namespace, NamespaceSchema) +where + R: RepoCollection + ?Sized, +{ + let namespace = repos + .namespaces() + .create( + &NamespaceName::new(namespace_name).unwrap(), + None, + None, + None, + ) + .await; + + let namespace = match namespace { + Ok(v) => v, + Err(Error::AlreadyExists { .. 
}) => repos + .namespaces() + .get_by_name(namespace_name, SoftDeletedRows::AllRows) + .await + .unwrap() + .unwrap(), + e @ Err(_) => e.unwrap(), + }; + + let batches = mutable_batch_lp::lines_to_batches(lines, 42).unwrap(); + let batches = batches.iter().map(|(table, batch)| (table.as_str(), batch)); + let ns = NamespaceSchema::new_empty_from(&namespace); + + let schema = validate_or_insert_schema(batches, &ns, repos) + .await + .expect("validate schema failed") + .unwrap_or(ns); + + (namespace, schema) +} + +async fn test_list_schemas(catalog: Arc) { + let mut repos = catalog.repositories(); + + let ns1 = populate_namespace( + repos.deref_mut(), + "ns1", + "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", + ) + .await; + let ns2 = populate_namespace( + repos.deref_mut(), + "ns2", + "cpu,tag=1 field=1i\nsomethingelse field=1u", + ) + .await; + + // Otherwise the in-mem catalog deadlocks.... (but not postgres) + drop(repos); + + let got = list_schemas(&*catalog) + .await + .expect("should be able to list the schemas") + .collect::>(); + + assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); + assert!(got.contains(&ns2), "{:#?}\n\nwant{:#?}", got, &ns2); +} + +async fn test_list_schemas_soft_deleted_rows(catalog: Arc) { + let mut repos = catalog.repositories(); + + let ns1 = populate_namespace( + repos.deref_mut(), + "ns1", + "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", + ) + .await; + let ns2 = populate_namespace( + repos.deref_mut(), + "ns2", + "cpu,tag=1 field=1i\nsomethingelse field=1u", + ) + .await; + + repos + .namespaces() + .soft_delete(&ns2.0.name) + .await + .expect("failed to soft delete namespace"); + + // Otherwise the in-mem catalog deadlocks.... 
(but not postgres) + drop(repos); + + let got = list_schemas(&*catalog) + .await + .expect("should be able to list the schemas") + .collect::>(); + + assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); + assert!(!got.contains(&ns2), "{:#?}\n\n do not want{:#?}", got, &ns2); +} + +/// Ensure that we can create two repo objects and that they instantly share their state. +/// +/// This is a regression test for . +async fn test_two_repos(catalog: Arc) { + let mut repos_1 = catalog.repositories(); + let mut repos_2 = catalog.repositories(); + let repo_1 = repos_1.namespaces(); + let repo_2 = repos_2.namespaces(); + + let namespace_name = NamespaceName::new("test_namespace").unwrap(); + repo_1 + .create(&namespace_name, None, None, None) + .await + .unwrap(); + + repo_2 + .get_by_name(&namespace_name, SoftDeletedRows::AllRows) + .await + .unwrap() + .unwrap(); +} + +async fn test_partition_create_or_get_idempotent(catalog: Arc) { + let mut repos = catalog.repositories(); + + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; + + let key = PartitionKey::from("bananas"); + + let hash_id = PartitionHashId::new(table_id, &key); + + let a = repos + .partitions() + .create_or_get(key.clone(), table_id) + .await + .expect("should create OK"); + + assert_eq!(a.hash_id().unwrap(), &hash_id); + // Test: sort_key_ids from partition_create_or_get_idempotent + assert!(a.sort_key_ids().is_none()); + + // Call create_or_get for the same (key, table_id) pair, to ensure the write is idempotent. + let b = repos + .partitions() + .create_or_get(key.clone(), table_id) + .await + .expect("idempotent write should succeed"); + + assert_eq!(a, b); + + // Check that the hash_id is saved in the database and is returned when queried. 
+ let table_partitions = repos.partitions().list_by_table_id(table_id).await.unwrap(); + assert_eq!(table_partitions.len(), 1); + assert_eq!(table_partitions[0].hash_id().unwrap(), &hash_id); + + // Test: sort_key_ids from partition_create_or_get_idempotent + assert!(table_partitions[0].sort_key_ids().is_none()); +} + +#[track_caller] +fn assert_metric_hit(metrics: &metric::Registry, name: &'static str) { + let histogram = metrics + .get_instrument::>("catalog_op_duration") + .expect("failed to read metric") + .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) + .expect("failed to get observer") + .fetch(); + + let hit_count = histogram.sample_count(); + assert!(hit_count > 0, "metric did not record any calls"); +} + +async fn test_column_create_or_get_many_unchecked(clean_state: R) +where + R: Fn() -> F + Send + Sync, + F: Future> + Send, +{ + // Issue a few calls to create_or_get_many that contain distinct columns and + // covers the full set of column types. + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ("test5", ColumnType::String), + ("test6", ColumnType::Time), + ("test7", ColumnType::Tag), + ], + &[("test8", ColumnType::String), ("test9", ColumnType::Bool)], + ], + |res| assert_matches!(res, Ok(_)), + ) + .await; + + // Issue two calls with overlapping columns - request should succeed (upsert + // semantics). 
+ test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ], + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ("test5", ColumnType::String), + ("test6", ColumnType::Time), + ("test7", ColumnType::Tag), + ("test8", ColumnType::String), + ], + ], + |res| assert_matches!(res, Ok(_)), + ) + .await; + + // Issue two calls with the same columns and types. + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ], + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ], + ], + |res| assert_matches!(res, Ok(_)), + ) + .await; + + // Issue two calls with overlapping columns with conflicting types and + // observe a correctly populated ColumnTypeMismatch error. + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::String), + ("test2", ColumnType::String), + ("test3", ColumnType::String), + ("test4", ColumnType::String), + ], + &[ + ("test1", ColumnType::String), + ("test2", ColumnType::Bool), // This one differs + ("test3", ColumnType::String), + // 4 is missing. 
+ ("test5", ColumnType::String), + ("test6", ColumnType::Time), + ("test7", ColumnType::Tag), + ("test8", ColumnType::String), + ], + ], + |res| assert_matches!(res, Err(e) => { + assert_matches!(e, Error::AlreadyExists { descr } => { + assert_eq!(descr, "column test2 is type string but schema update has type bool"); + }) + }), + ).await; +} + +async fn test_column_create_or_get_many_unchecked_sub( + catalog: Arc, + calls: &[&[(&'static str, ColumnType)]], + want: F, +) where + F: FnOnce(Result, Error>) + Send, +{ + let mut repos = catalog.repositories(); + + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; + + let mut last_got = None; + for insert in calls { + let insert = insert + .iter() + .map(|(n, t)| (*n, *t)) + .collect::>(); + + let got = repos + .columns() + .create_or_get_many_unchecked(table_id, insert.clone()) + .await; + + // The returned columns MUST always match the requested + // column values if successful. + if let Ok(got) = &got { + assert_eq!(insert.len(), got.len()); + + for got in got { + assert_eq!(table_id, got.table_id); + let requested_column_type = insert + .get(got.name.as_str()) + .expect("Should have gotten back a column that was inserted"); + assert_eq!(*requested_column_type, got.column_type,); + } + + assert_metric_hit(&catalog.metrics(), "column_create_or_get_many_unchecked"); + } + + last_got = Some(got); + } + + want(last_got.unwrap()); +} + +/// [`Catalog`] wrapper that is helpful for testing. +#[derive(Debug)] +pub(crate) struct TestCatalog { + hold_onto: Mutex>>, + inner: Arc, +} + +impl TestCatalog { + /// Create new test catalog. + pub(crate) fn new(inner: Arc) -> Self { + Self { + hold_onto: Mutex::new(vec![]), + inner, + } + } + + /// Hold onto given value til dropped. 
+ pub(crate) fn hold_onto(&self, o: T) + where + T: Send + 'static, + { + self.hold_onto.lock().push(Box::new(o) as _) + } +} + +impl Display for TestCatalog { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "test({})", self.inner) + } +} + +#[async_trait] +impl Catalog for TestCatalog { + async fn setup(&self) -> Result<(), Error> { + self.inner.setup().await + } + + fn repositories(&self) -> Box { + self.inner.repositories() + } + + fn metrics(&self) -> Arc { + self.inner.metrics() + } + + fn time_provider(&self) -> Arc { + self.inner.time_provider() + } +} + +#[track_caller] +fn assert_gt(a: T, b: T) +where + T: Display + PartialOrd, +{ + assert!(a > b, "failed: {a} > {b}",); +} + +#[track_caller] +fn assert_ge(a: T, b: T) +where + T: Display + PartialOrd, +{ + assert!(a >= b, "failed: {a} >= {b}",); +} diff --git a/iox_catalog/src/kafkaless_transition.rs b/iox_catalog/src/kafkaless_transition.rs new file mode 100644 index 0000000..4216216 --- /dev/null +++ b/iox_catalog/src/kafkaless_transition.rs @@ -0,0 +1,95 @@ +/// Magic number to be used shard indices and shard ids in "kafkaless". +pub(crate) const TRANSITION_SHARD_NUMBER: i32 = 1234; +/// In kafkaless mode all new persisted data uses this shard id. +pub(crate) const TRANSITION_SHARD_ID: ShardId = ShardId::new(TRANSITION_SHARD_NUMBER as i64); +/// In kafkaless mode all new persisted data uses this shard index. +pub(crate) const TRANSITION_SHARD_INDEX: ShardIndex = ShardIndex::new(TRANSITION_SHARD_NUMBER); +pub(crate) const SHARED_TOPIC_NAME: &str = "iox-shared"; +pub(crate) const SHARED_TOPIC_ID: TopicId = TopicId::new(1); +pub(crate) const SHARED_QUERY_POOL_ID: QueryPoolId = QueryPoolId::new(1); +pub(crate) const SHARED_QUERY_POOL: &str = SHARED_TOPIC_NAME; + +/// Unique ID for a `Shard`, assigned by the catalog. Joins to other catalog tables to uniquely +/// identify shards independently of the underlying write buffer implementation. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub(crate) struct ShardId(i64); + +#[allow(missing_docs)] +impl ShardId { + pub(crate) const fn new(v: i64) -> Self { + Self(v) + } +} + +impl std::fmt::Display for ShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +/// The index of the shard in the set of shards. When Kafka is used as the write buffer, this is +/// the Kafka Partition ID. Used by the router and write buffer to shard requests to a particular +/// index in a set of shards. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub(crate) struct ShardIndex(i32); + +#[allow(missing_docs)] +impl ShardIndex { + pub(crate) const fn new(v: i32) -> Self { + Self(v) + } +} + +impl std::fmt::Display for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::str::FromStr for ShardIndex { + type Err = std::num::ParseIntError; + + fn from_str(s: &str) -> Result { + let v: i32 = s.parse()?; + Ok(Self(v)) + } +} + +/// Data object for a shard. Only one shard record can exist for a given topic and shard +/// index (enforced via uniqueness constraint). 
+#[derive(Debug, Copy, Clone, PartialEq, Eq, sqlx::FromRow)] +pub(crate) struct Shard { + /// the id of the shard, assigned by the catalog + pub(crate) id: ShardId, + /// the topic the shard is reading from + pub(crate) topic_id: TopicId, + /// the shard index of the shard the sequence numbers are coming from, sharded by the router + /// and write buffer + pub(crate) shard_index: ShardIndex, +} + +/// Unique ID for a Topic, assigned by the catalog +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub struct TopicId(i64); + +#[allow(missing_docs)] +impl TopicId { + pub const fn new(v: i64) -> Self { + Self(v) + } +} + +/// Unique ID for a `QueryPool` +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub struct QueryPoolId(i64); + +#[allow(missing_docs)] +impl QueryPoolId { + pub const fn new(v: i64) -> Self { + Self(v) + } +} diff --git a/iox_catalog/src/lib.rs b/iox_catalog/src/lib.rs new file mode 100644 index 0000000..17fa14f --- /dev/null +++ b/iox_catalog/src/lib.rs @@ -0,0 +1,35 @@ +//! The IOx catalog keeps track of the namespaces, tables, columns, parquet files, +//! and deletes in the system. Configuration information for distributing ingest, query +//! and compaction is also stored here. +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives. 
+use workspace_hack as _; + +pub mod cache; +pub mod constants; +pub mod grpc; +pub mod interface; +pub mod mem; +pub mod metrics; +pub mod migrate; +pub mod postgres; +pub mod sqlite; +pub mod test_helpers; +pub mod util; + +#[cfg(test)] +pub(crate) mod interface_tests; diff --git a/iox_catalog/src/mem.rs b/iox_catalog/src/mem.rs new file mode 100644 index 0000000..0d810fd --- /dev/null +++ b/iox_catalog/src/mem.rs @@ -0,0 +1,1135 @@ +//! This module implements an in-memory implementation of the iox_catalog interface. It can be +//! used for testing or for an IOx designed to run without catalog persistence. + +use crate::{ + constants::{ + MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE, MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + }, + interface::{ + AlreadyExistsSnafu, CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, + PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, + }, + metrics::MetricDecorator, +}; +use async_trait::async_trait; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; +use data_types::{ + partition_template::{ + NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, TemplatePart, + }, + Column, ColumnId, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, + NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, + ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, + PartitionKey, SkippedCompaction, SortKeyIds, Table, TableId, Timestamp, +}; +use iox_time::TimeProvider; +use parking_lot::Mutex; +use snafu::ensure; +use std::ops::Deref; +use std::{ + collections::{HashMap, HashSet}, + fmt::{Display, Formatter}, + ops::DerefMut, + sync::Arc, +}; + +/// In-memory catalog that implements the `RepoCollection` and individual repo traits from +/// the catalog interface. 
+pub struct MemCatalog { + metrics: Arc, + collections: Arc>, + time_provider: Arc, +} + +impl MemCatalog { + /// return new initialized [`MemCatalog`] + pub fn new(metrics: Arc, time_provider: Arc) -> Self { + Self { + metrics, + collections: Default::default(), + time_provider, + } + } + + /// Add partition directly, for testing purposes only as it does not do any consistency or + /// uniqueness checks + pub fn add_partition(&self, partition: Partition) { + let mut stage = self.collections.lock(); + stage.partitions.push(partition.into()); + } +} + +impl std::fmt::Debug for MemCatalog { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MemCatalog").finish_non_exhaustive() + } +} + +/// A wrapper around `T` adding a generation number +#[derive(Debug, Clone)] +struct Versioned { + generation: u64, + value: T, +} + +impl Deref for Versioned { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.value + } +} + +impl DerefMut for Versioned { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.value + } +} + +impl From for Versioned { + fn from(value: T) -> Self { + Self { + generation: 0, + value, + } + } +} + +#[derive(Default, Debug, Clone)] +struct MemCollections { + namespaces: Vec, + tables: Vec>, + columns: Vec, + partitions: Vec>, + skipped_compactions: Vec, + parquet_files: Vec, +} + +/// transaction bound to an in-memory catalog. 
+#[derive(Debug)] +pub struct MemTxn { + collections: Arc>, + time_provider: Arc, +} + +impl Display for MemCatalog { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Memory") + } +} + +#[async_trait] +impl Catalog for MemCatalog { + async fn setup(&self) -> Result<(), Error> { + Ok(()) + } + + fn repositories(&self) -> Box { + let collections = Arc::clone(&self.collections); + Box::new(MetricDecorator::new( + MemTxn { + collections, + time_provider: self.time_provider(), + }, + Arc::clone(&self.metrics), + self.time_provider(), + )) + } + + #[cfg(test)] + fn metrics(&self) -> Arc { + Arc::clone(&self.metrics) + } + + fn time_provider(&self) -> Arc { + Arc::clone(&self.time_provider) + } +} + +impl RepoCollection for MemTxn { + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + } +} + +#[async_trait] +impl NamespaceRepo for MemTxn { + async fn create( + &mut self, + name: &NamespaceName<'_>, + partition_template: Option, + retention_period_ns: Option, + service_protection_limits: Option, + ) -> Result { + let mut stage = self.collections.lock(); + + if stage.namespaces.iter().any(|n| n.name == name.as_str()) { + return Err(Error::AlreadyExists { + descr: name.to_string(), + }); + } + + let max_tables = service_protection_limits + .and_then(|l| l.max_tables) + .unwrap_or_default(); + let max_columns_per_table = service_protection_limits + .and_then(|l| l.max_columns_per_table) + .unwrap_or_default(); + + let namespace = Namespace { + id: NamespaceId::new(stage.namespaces.len() as i64 + 1), + name: name.to_string(), + max_tables, + max_columns_per_table, + retention_period_ns, + deleted_at: None, + partition_template: partition_template.unwrap_or_default(), + }; + 
stage.namespaces.push(namespace); + Ok(stage.namespaces.last().unwrap().clone()) + } + + async fn list(&mut self, deleted: SoftDeletedRows) -> Result> { + let stage = self.collections.lock(); + + Ok(filter_namespace_soft_delete(&stage.namespaces, deleted) + .cloned() + .collect()) + } + + async fn get_by_id( + &mut self, + id: NamespaceId, + deleted: SoftDeletedRows, + ) -> Result> { + let stage = self.collections.lock(); + + let res = filter_namespace_soft_delete(&stage.namespaces, deleted) + .find(|n| n.id == id) + .cloned(); + + Ok(res) + } + + async fn get_by_name( + &mut self, + name: &str, + deleted: SoftDeletedRows, + ) -> Result> { + let stage = self.collections.lock(); + + let res = filter_namespace_soft_delete(&stage.namespaces, deleted) + .find(|n| n.name == name) + .cloned(); + + Ok(res) + } + + // performs a cascading delete of all things attached to the namespace, then deletes the + // namespace + async fn soft_delete(&mut self, name: &str) -> Result<()> { + let mut stage = self.collections.lock(); + let timestamp = self.time_provider.now(); + // get namespace by name + match stage.namespaces.iter_mut().find(|n| n.name == name) { + Some(n) => { + n.deleted_at = Some(Timestamp::from(timestamp)); + Ok(()) + } + None => Err(Error::NotFound { + descr: name.to_string(), + }), + } + } + + async fn update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result { + let mut stage = self.collections.lock(); + match stage.namespaces.iter_mut().find(|n| n.name == name) { + Some(n) => { + n.max_tables = new_max; + Ok(n.clone()) + } + None => Err(Error::NotFound { + descr: name.to_string(), + }), + } + } + + async fn update_column_limit( + &mut self, + name: &str, + new_max: MaxColumnsPerTable, + ) -> Result { + let mut stage = self.collections.lock(); + match stage.namespaces.iter_mut().find(|n| n.name == name) { + Some(n) => { + n.max_columns_per_table = new_max; + Ok(n.clone()) + } + None => Err(Error::NotFound { + descr: name.to_string(), + }), + } + } 
+ + async fn update_retention_period( + &mut self, + name: &str, + retention_period_ns: Option, + ) -> Result { + let mut stage = self.collections.lock(); + match stage.namespaces.iter_mut().find(|n| n.name == name) { + Some(n) => { + n.retention_period_ns = retention_period_ns; + Ok(n.clone()) + } + None => Err(Error::NotFound { + descr: name.to_string(), + }), + } + } +} + +#[async_trait] +impl TableRepo for MemTxn { + async fn create( + &mut self, + name: &str, + partition_template: TablePartitionTemplateOverride, + namespace_id: NamespaceId, + ) -> Result
{ + let mut stage = self.collections.lock(); + + let table = { + // this block is just to ensure the mem impl correctly creates TableCreateLimitError in + // tests, we don't care about any of the errors it is discarding + stage + .namespaces + .iter() + .find(|n| n.id == namespace_id) + .cloned() + .ok_or_else(|| Error::NotFound { + // we're never going to use this error, this is just for flow control, + // so it doesn't matter that we only have the ID, not the name + descr: "".to_string(), + }) + .and_then(|n| { + let max_tables = n.max_tables; + let tables_count = stage + .tables + .iter() + .filter(|t| t.namespace_id == namespace_id) + .count(); + if tables_count >= max_tables.get() { + return Err(Error::LimitExceeded { + descr: format!( + "couldn't create table {}; limit reached on namespace {}", + name, namespace_id + ), + }); + } + Ok(()) + })?; + + match stage + .tables + .iter() + .find(|t| t.name == name && t.namespace_id == namespace_id) + { + Some(_t) => { + return Err(Error::AlreadyExists { + descr: format!("table '{name}' in namespace {namespace_id}"), + }) + } + None => { + let table = Table { + id: TableId::new(stage.tables.len() as i64 + 1), + namespace_id, + name: name.to_string(), + partition_template, + }; + stage.tables.push(table.into()); + stage.tables.last().unwrap().value.clone() + } + } + }; + + // Partitioning is only supported for tags, so create tag columns for all `TagValue` + // partition template parts. It's important this happens within the table creation + // transaction so that there isn't a possibility of a concurrent write creating these + // columns with an unsupported type. 
+ for template_part in table.partition_template.parts() { + if let TemplatePart::TagValue(tag_name) = template_part { + create_or_get_column(&mut stage, tag_name, table.id, ColumnType::Tag)?; + } + } + + Ok(table) + } + + async fn get_by_id(&mut self, table_id: TableId) -> Result> { + let stage = self.collections.lock(); + + let mut tables = stage.tables.iter(); + Ok(tables.find(|t| t.id == table_id).map(|v| v.value.clone())) + } + + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result> { + let stage = self.collections.lock(); + + let mut tables = stage.tables.iter(); + let search = tables.find(|t| t.namespace_id == namespace_id && t.name == name); + Ok(search.map(|v| v.value.clone())) + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let stage = self.collections.lock(); + + let tables = stage.tables.iter(); + let filtered = tables.filter(|t| t.namespace_id == namespace_id); + let tables: Vec<_> = filtered.map(|v| v.value.clone()).collect(); + Ok(tables) + } + + async fn list(&mut self) -> Result> { + let stage = self.collections.lock(); + Ok(stage.tables.iter().map(|v| v.value.clone()).collect()) + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let mut guard = self.collections.lock(); + + let (table, generation) = { + let mut tables = guard.tables.iter_mut(); + let search = tables.find(|x| x.id == table_id); + let table = search.ok_or_else(|| Error::NotFound { + descr: table_id.to_string(), + })?; + + let generation = table.generation; + table.generation += 1; + (table.value.clone(), generation) + }; + + let columns = guard + .columns + .iter() + .filter(|x| x.table_id == table_id) + .cloned() + .collect(); + + let partitions = guard + .partitions + .iter() + .filter(|x| x.table_id == table_id) + .map(|v| v.value.clone()) + .collect(); + + Ok(TableSnapshot::encode( + table, partitions, columns, generation, + )?) 
+ } +} + +#[async_trait] +impl ColumnRepo for MemTxn { + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result { + let mut stage = self.collections.lock(); + create_or_get_column(&mut stage, name, table_id, column_type) + } + + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result> { + // Explicitly NOT using `create_or_get` in this function: the Postgres catalog doesn't + // check column limits when inserting many columns because it's complicated and expensive, + // and for testing purposes the in-memory catalog needs to match its functionality. + + let mut stage = self.collections.lock(); + + let out: Vec<_> = columns + .iter() + .map(|(&column_name, &column_type)| { + match stage + .columns + .iter() + .find(|t| t.name == column_name && t.table_id == table_id) + { + Some(c) => { + ensure!( + column_type == c.column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + column_name, c.column_type, column_type + ), + } + ); + Ok(c.clone()) + } + None => { + let new_column = Column { + id: ColumnId::new(stage.columns.len() as i64 + 1), + table_id, + name: column_name.to_string(), + column_type, + }; + stage.columns.push(new_column); + Ok(stage.columns.last().unwrap().clone()) + } + } + }) + .collect::>>()?; + + Ok(out) + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let stage = self.collections.lock(); + + let table_ids: Vec<_> = stage + .tables + .iter() + .filter(|t| t.namespace_id == namespace_id) + .map(|t| t.id) + .collect(); + let columns: Vec<_> = stage + .columns + .iter() + .filter(|c| table_ids.contains(&c.table_id)) + .cloned() + .collect(); + + Ok(columns) + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let stage = self.collections.lock(); + + let columns: Vec<_> = stage + .columns + .iter() + 
.filter(|c| c.table_id == table_id) + .cloned() + .collect(); + + Ok(columns) + } + + async fn list(&mut self) -> Result> { + let stage = self.collections.lock(); + Ok(stage.columns.clone()) + } +} + +#[async_trait] +impl PartitionRepo for MemTxn { + async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { + let mut stage = self.collections.lock(); + + let partition = match stage + .partitions + .iter() + .find(|p| p.partition_key == key && p.table_id == table_id) + { + Some(p) => p, + None => { + let hash_id = PartitionHashId::new(table_id, &key); + let p = Partition::new_catalog_only( + PartitionId::new(stage.partitions.len() as i64 + 1), + Some(hash_id), + table_id, + key, + SortKeyIds::default(), + None, + ); + stage.partitions.push(p.into()); + stage.partitions.last().unwrap() + } + }; + + Ok(partition.value.clone()) + } + + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + let lookup = partition_ids.iter().collect::>(); + + let stage = self.collections.lock(); + + Ok(stage + .partitions + .iter() + .filter(|p| lookup.contains(&p.id)) + .map(|x| x.value.clone()) + .collect()) + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let stage = self.collections.lock(); + + let partitions: Vec<_> = stage + .partitions + .iter() + .filter(|p| p.table_id == table_id) + .map(|x| x.value.clone()) + .collect(); + Ok(partitions) + } + + async fn list_ids(&mut self) -> Result> { + let stage = self.collections.lock(); + + let partitions: Vec<_> = stage.partitions.iter().map(|p| p.id).collect(); + + Ok(partitions) + } + + async fn cas_sort_key( + &mut self, + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let mut stage = self.collections.lock(); + + match stage.partitions.iter_mut().find(|p| p.id == partition_id) { + Some(p) if p.sort_key_ids() == old_sort_key_ids => { + p.set_sort_key_ids(new_sort_key_ids); + 
Ok(p.value.clone()) + } + Some(p) => { + return Err(CasFailure::ValueMismatch( + p.sort_key_ids().cloned().unwrap_or_default(), + )); + } + None => Err(CasFailure::QueryError(Error::NotFound { + descr: partition_id.to_string(), + })), + } + } + + async fn record_skipped_compaction( + &mut self, + partition_id: PartitionId, + reason: &str, + num_files: usize, + limit_num_files: usize, + limit_num_files_first_in_partition: usize, + estimated_bytes: u64, + limit_bytes: u64, + ) -> Result<()> { + let mut stage = self.collections.lock(); + + let reason = reason.to_string(); + let skipped_at = Timestamp::from(self.time_provider.now()); + + let sc = SkippedCompaction { + partition_id, + reason, + skipped_at, + num_files: num_files as i64, + limit_num_files: limit_num_files as i64, + limit_num_files_first_in_partition: limit_num_files_first_in_partition as i64, + estimated_bytes: estimated_bytes as i64, + limit_bytes: limit_bytes as i64, + }; + + match stage + .skipped_compactions + .iter_mut() + .find(|s| s.partition_id == partition_id) + { + Some(s) => { + *s = sc; + } + None => stage.skipped_compactions.push(sc), + } + Ok(()) + } + + async fn get_in_skipped_compactions( + &mut self, + partition_ids: &[PartitionId], + ) -> Result> { + let stage = self.collections.lock(); + let find: HashSet<&PartitionId> = partition_ids.iter().collect(); + Ok(stage + .skipped_compactions + .iter() + .filter(|s| find.contains(&s.partition_id)) + .cloned() + .collect()) + } + + async fn list_skipped_compactions(&mut self) -> Result> { + let stage = self.collections.lock(); + Ok(stage.skipped_compactions.clone()) + } + + async fn delete_skipped_compactions( + &mut self, + partition_id: PartitionId, + ) -> Result> { + use std::mem; + + let mut stage = self.collections.lock(); + let skipped_compactions = mem::take(&mut stage.skipped_compactions); + let (mut removed, remaining) = skipped_compactions + .into_iter() + .partition(|sc| sc.partition_id == partition_id); + stage.skipped_compactions 
= remaining; + + match removed.pop() { + Some(sc) if removed.is_empty() => Ok(Some(sc)), + Some(_) => unreachable!("There must be exactly one skipped compaction per partition"), + None => Ok(None), + } + } + + async fn most_recent_n(&mut self, n: usize) -> Result> { + let stage = self.collections.lock(); + let iter = stage.partitions.iter().rev().take(n); + Ok(iter.map(|x| x.value.clone()).collect()) + } + + async fn partitions_new_file_between( + &mut self, + minimum_time: Timestamp, + maximum_time: Option, + ) -> Result> { + let stage = self.collections.lock(); + + let partitions: Vec<_> = stage + .partitions + .iter() + .filter(|p| { + p.new_file_at > Some(minimum_time) + && maximum_time + .map(|max| p.new_file_at < Some(max)) + .unwrap_or(true) + }) + .map(|p| p.id) + .collect(); + + Ok(partitions) + } + + async fn list_old_style(&mut self) -> Result> { + let stage = self.collections.lock(); + + let old_style: Vec<_> = stage + .partitions + .iter() + .filter(|p| p.hash_id().is_none()) + .map(|x| x.value.clone()) + .collect(); + + Ok(old_style) + } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let mut guard = self.collections.lock(); + let (partition, generation) = { + let search = guard.partitions.iter_mut().find(|x| x.id == partition_id); + let partition = search.ok_or_else(|| Error::NotFound { + descr: format!("Partition {partition_id} not found"), + })?; + + let generation = partition.generation; + partition.generation += 1; + (partition.value.clone(), generation) + }; + + let files = guard + .parquet_files + .iter() + .filter(|x| x.partition_id == partition_id && x.to_delete.is_none()) + .cloned() + .collect(); + + let search = guard.tables.iter().find(|x| x.id == partition.table_id); + let table = search.ok_or_else(|| Error::NotFound { + descr: format!("Table {} not found", partition.table_id), + })?; + + let sc = guard + .skipped_compactions + .iter() + .find(|sc| sc.partition_id == partition_id) + .cloned(); + + 
Ok(PartitionSnapshot::encode( + table.namespace_id, + partition, + files, + sc, + generation, + )?) + } +} + +#[async_trait] +impl ParquetFileRepo for MemTxn { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let mut stage = self.collections.lock(); + let now = Timestamp::from(self.time_provider.now()); + let stage = stage.deref_mut(); + + Ok(stage + .parquet_files + .iter_mut() + // don't flag if already flagged for deletion + .filter(|f| f.to_delete.is_none()) + .filter_map(|f| { + // table retention, if it exists, overrides namespace retention + // TODO - include check of table retention period once implemented + stage + .namespaces + .iter() + .find(|n| n.id == f.namespace_id) + .and_then(|ns| { + ns.retention_period_ns.and_then(|rp| { + if f.max_time < now - rp { + f.to_delete = Some(now); + Some((f.partition_id, f.object_store_id)) + } else { + None + } + }) + }) + }) + .take(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION as usize) + .collect()) + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + let mut stage = self.collections.lock(); + + let (delete, keep): (Vec<_>, Vec<_>) = stage.parquet_files.iter().cloned().partition( + |f| matches!(f.to_delete, Some(marked_deleted) if marked_deleted < older_than), + ); + + stage.parquet_files = keep; + + let delete = delete + .into_iter() + .take(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE as usize) + .map(|f| f.object_store_id) + .collect(); + Ok(delete) + } + + async fn list_by_partition_not_to_delete_batch( + &mut self, + partition_ids: Vec, + ) -> Result> { + let partition_ids = partition_ids.into_iter().collect::>(); + let stage = self.collections.lock(); + + Ok(stage + .parquet_files + .iter() + .filter(|f| partition_ids.contains(&f.partition_id) && f.to_delete.is_none()) + .cloned() + .collect()) + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result> { + let stage = self.collections.lock(); + + Ok(stage + 
.parquet_files + .iter() + .find(|f| f.object_store_id.eq(&object_store_id)) + .cloned()) + } + + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result> { + let stage = self.collections.lock(); + + Ok(stage + .parquet_files + .iter() + .filter(|f| object_store_ids.contains(&f.object_store_id)) + .map(|f| f.object_store_id) + .collect()) + } + + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result> { + let delete_set = delete.iter().copied().collect::>(); + let upgrade_set = upgrade.iter().copied().collect::>(); + + assert!( + delete_set.is_disjoint(&upgrade_set), + "attempted to upgrade a file scheduled for delete" + ); + + let mut collections = self.collections.lock(); + let mut stage = collections.clone(); + + for id in delete { + let marked_at = Timestamp::from(self.time_provider.now()); + flag_for_delete(&mut stage, partition_id, *id, marked_at)?; + } + + update_compaction_level(&mut stage, partition_id, upgrade, target_level)?; + + let mut ids = Vec::with_capacity(create.len()); + for file in create { + if file.partition_id != partition_id { + return Err(Error::External { + source: format!("Inconsistent ParquetFileParams, expected PartitionId({partition_id}) got PartitionId({})", file.partition_id).into(), + }); + } + let res = create_parquet_file(&mut stage, file.clone())?; + ids.push(res.id); + } + + *collections = stage; + + Ok(ids) + } +} + +fn filter_namespace_soft_delete<'a>( + v: impl IntoIterator, + deleted: SoftDeletedRows, +) -> impl Iterator { + v.into_iter().filter(move |v| match deleted { + SoftDeletedRows::AllRows => true, + SoftDeletedRows::ExcludeDeleted => v.deleted_at.is_none(), + SoftDeletedRows::OnlyDeleted => v.deleted_at.is_some(), + }) +} + +fn create_or_get_column( + stage: &mut MemCollections, + name: &str, + table_id: TableId, + column_type: 
ColumnType, +) -> Result { + // this block is just to ensure the mem impl correctly creates ColumnCreateLimitError in + // tests, we don't care about any of the errors it is discarding + stage + .tables + .iter() + .find(|t| t.id == table_id) + .cloned() + .ok_or(Error::NotFound { + descr: format!("table: {}", table_id), + }) // error never used, this is just for flow control + .and_then(|t| { + stage + .namespaces + .iter() + .find(|n| n.id == t.namespace_id) + .cloned() + .ok_or_else(|| Error::NotFound { + // we're never going to use this error, this is just for flow control, + // so it doesn't matter that we only have the ID, not the name + descr: "".to_string(), + }) + .and_then(|n| { + let max_columns_per_table = n.max_columns_per_table; + let columns_count = stage + .columns + .iter() + .filter(|t| t.table_id == table_id) + .count(); + if columns_count >= max_columns_per_table.get() { + return Err(Error::LimitExceeded { + descr: format!( + "couldn't create column {} in table {}; limit reached on namespace", + name, table_id + ), + }); + } + Ok(()) + })?; + Ok(()) + })?; + + let column = match stage + .columns + .iter() + .find(|t| t.name == name && t.table_id == table_id) + { + Some(c) => { + ensure!( + column_type == c.column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + name, c.column_type, column_type + ), + } + ); + c + } + None => { + let column = Column { + id: ColumnId::new(stage.columns.len() as i64 + 1), + table_id, + name: name.to_string(), + column_type, + }; + stage.columns.push(column); + stage.columns.last().unwrap() + } + }; + + Ok(column.clone()) +} + +// The following three functions are helpers to the create_upgrade_delete method. +// They are also used by the respective create/flag_for_delete/update_compaction_level methods. 
+fn create_parquet_file( + stage: &mut MemCollections, + parquet_file_params: ParquetFileParams, +) -> Result { + if stage + .parquet_files + .iter() + .any(|f| f.object_store_id == parquet_file_params.object_store_id) + { + return Err(Error::AlreadyExists { + descr: parquet_file_params.object_store_id.to_string(), + }); + } + + let parquet_file = ParquetFile::from_params( + parquet_file_params, + ParquetFileId::new(stage.parquet_files.len() as i64 + 1), + ); + let created_at = parquet_file.created_at; + let partition_id = parquet_file.partition_id; + stage.parquet_files.push(parquet_file); + + // Update the new_file_at field its partition to the time of created_at + let partition = stage + .partitions + .iter_mut() + .find(|p| p.id == partition_id) + .ok_or(Error::NotFound { + descr: partition_id.to_string(), + })?; + partition.new_file_at = Some(created_at); + + Ok(stage.parquet_files.last().unwrap().clone()) +} + +fn flag_for_delete( + stage: &mut MemCollections, + partition_id: PartitionId, + id: ObjectStoreId, + marked_at: Timestamp, +) -> Result<()> { + match stage + .parquet_files + .iter_mut() + .find(|p| p.object_store_id == id && p.partition_id == partition_id) + { + Some(f) if f.to_delete.is_none() => f.to_delete = Some(marked_at), + _ => { + return Err(Error::NotFound { + descr: format!("parquet file {id} not found for delete"), + }) + } + } + + Ok(()) +} + +fn update_compaction_level( + stage: &mut MemCollections, + partition_id: PartitionId, + object_store_ids: &[ObjectStoreId], + compaction_level: CompactionLevel, +) -> Result> { + let all_ids = stage + .parquet_files + .iter() + .filter(|f| f.partition_id == partition_id && f.to_delete.is_none()) + .map(|f| f.object_store_id) + .collect::>(); + for id in object_store_ids { + if !all_ids.contains(id) { + return Err(Error::NotFound { + descr: format!("parquet file {id} not found for upgrade"), + }); + } + } + + let update_ids = object_store_ids.iter().copied().collect::>(); + let mut updated = 
Vec::with_capacity(object_store_ids.len()); + for f in stage + .parquet_files + .iter_mut() + .filter(|p| update_ids.contains(&p.object_store_id) && p.partition_id == partition_id) + { + f.compaction_level = compaction_level; + updated.push(f.object_store_id); + } + + Ok(updated) +} + +#[cfg(test)] +mod tests { + use iox_time::SystemProvider; + + use super::*; + use std::sync::Arc; + + #[tokio::test] + async fn test_catalog() { + crate::interface_tests::test_catalog(|| async { + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(SystemProvider::new()); + let x: Arc = Arc::new(MemCatalog::new(metrics, time_provider)); + x + }) + .await; + } +} diff --git a/iox_catalog/src/metrics.rs b/iox_catalog/src/metrics.rs new file mode 100644 index 0000000..b179fd3 --- /dev/null +++ b/iox_catalog/src/metrics.rs @@ -0,0 +1,203 @@ +//! Metric instrumentation for catalog implementations. + +use crate::interface::{ + CasFailure, ColumnRepo, NamespaceRepo, ParquetFileRepo, PartitionRepo, RepoCollection, Result, + SoftDeletedRows, TableRepo, +}; +use async_trait::async_trait; +use data_types::snapshot::table::TableSnapshot; +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + snapshot::partition::PartitionSnapshot, + Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionKey, SkippedCompaction, + SortKeyIds, Table, TableId, Timestamp, +}; +use iox_time::TimeProvider; +use metric::{DurationHistogram, Metric}; +use std::{collections::HashMap, fmt::Debug, sync::Arc}; + +/// Decorates a implementation of the catalog's [`RepoCollection`] (and the +/// transactional variant) with instrumentation that emits latency histograms +/// for each method. 
+/// +/// Values are recorded under the `catalog_op_duration` metric, labelled by +/// operation name and result (success/error). +#[derive(Debug)] +pub struct MetricDecorator { + inner: T, + time_provider: Arc, + metrics: Arc, +} + +impl MetricDecorator { + /// Wrap `T` with instrumentation recording operation latency in `metrics`. + pub fn new( + inner: T, + metrics: Arc, + time_provider: Arc, + ) -> Self { + Self { + inner, + time_provider, + metrics, + } + } +} + +impl RepoCollection for MetricDecorator +where + T: NamespaceRepo + TableRepo + ColumnRepo + PartitionRepo + ParquetFileRepo + Debug, +{ + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + } +} + +/// Emit a trait impl for `impl_trait` that delegates calls to the inner +/// implementation, recording the duration and result to the metrics registry. +/// +/// Format: +/// +/// ```ignore +/// decorate!( +/// impl_trait = , +/// methods = [ +/// "" = ; +/// "" = ; +/// // ... and so on +/// ] +/// ); +/// ``` +/// +/// All methods of a given trait MUST be defined in the `decorate!()` call so +/// they are all instrumented or the decorator will not compile as it won't +/// fully implement the trait. +macro_rules! decorate { + ( + impl_trait = $trait:ident, + methods = [$( + $metric:literal = $method:ident( + &mut self $(,)? + $($arg:ident : $t:ty),* + ) -> Result<$out:ty$(, $err:ty)?>; + )+] + ) => { + #[async_trait] + impl $trait for MetricDecorator { + /// NOTE: if you're seeing an error here about "not all trait items + /// implemented" or something similar, one or more methods are + /// missing from / incorrectly defined in the decorate!() blocks + /// below. 
+ + $( + async fn $method(&mut self, $($arg : $t),*) -> Result<$out$(, $err)?> { + let observer: Metric = self.metrics.register_metric( + "catalog_op_duration", + "catalog call duration", + ); + + let t = self.time_provider.now(); + let res = self.inner.$method($($arg),*).await; + + // Avoid exploding if time goes backwards - simply drop the + // measurement if it happens. + if let Some(delta) = self.time_provider.now().checked_duration_since(t) { + let tag = match &res { + Ok(_) => "success", + Err(_) => "error", + }; + observer.recorder(&[("op", $metric), ("result", tag)]).record(delta); + } + + res + } + )+ + } + }; +} + +decorate!( + impl_trait = NamespaceRepo, + methods = [ + "namespace_create" = create(&mut self, name: &NamespaceName<'_>, partition_template: Option, retention_period_ns: Option, service_protection_limits: Option) -> Result; + "namespace_update_retention_period" = update_retention_period(&mut self, name: &str, retention_period_ns: Option) -> Result; + "namespace_list" = list(&mut self, deleted: SoftDeletedRows) -> Result>; + "namespace_get_by_id" = get_by_id(&mut self, id: NamespaceId, deleted: SoftDeletedRows) -> Result>; + "namespace_get_by_name" = get_by_name(&mut self, name: &str, deleted: SoftDeletedRows) -> Result>; + "namespace_soft_delete" = soft_delete(&mut self, name: &str) -> Result<()>; + "namespace_update_table_limit" = update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result; + "namespace_update_column_limit" = update_column_limit(&mut self, name: &str, new_max: MaxColumnsPerTable) -> Result; + ] +); + +decorate!( + impl_trait = TableRepo, + methods = [ + "table_create" = create(&mut self, name: &str, partition_template: TablePartitionTemplateOverride, namespace_id: NamespaceId) -> Result
; + "table_get_by_id" = get_by_id(&mut self, table_id: TableId) -> Result>; + "table_get_by_namespace_and_name" = get_by_namespace_and_name(&mut self, namespace_id: NamespaceId, name: &str) -> Result>; + "table_list_by_namespace_id" = list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result>; + "table_list" = list(&mut self) -> Result>; + "table_snapshot" = snapshot(&mut self, table_id: TableId) -> Result; + ] +); + +decorate!( + impl_trait = ColumnRepo, + methods = [ + "column_create_or_get" = create_or_get(&mut self, name: &str, table_id: TableId, column_type: ColumnType) -> Result; + "column_list_by_namespace_id" = list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result>; + "column_list_by_table_id" = list_by_table_id(&mut self, table_id: TableId) -> Result>; + "column_create_or_get_many_unchecked" = create_or_get_many_unchecked(&mut self, table_id: TableId, columns: HashMap<&str, ColumnType>) -> Result>; + "column_list" = list(&mut self) -> Result>; + ] +); + +decorate!( + impl_trait = PartitionRepo, + methods = [ + "partition_create_or_get" = create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result; + "partition_get_by_id_batch" = get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result>; + "partition_list_by_table_id" = list_by_table_id(&mut self, table_id: TableId) -> Result>; + "partition_list_ids" = list_ids(&mut self) -> Result>; + "partition_update_sort_key" = cas_sort_key(&mut self, partition_id: PartitionId, old_sort_key_ids: Option<&SortKeyIds>, new_sort_key_ids: &SortKeyIds) -> Result>; + "partition_record_skipped_compaction" = record_skipped_compaction(&mut self, partition_id: PartitionId, reason: &str, num_files: usize, limit_num_files: usize, limit_num_files_first_in_partition: usize, estimated_bytes: u64, limit_bytes: u64) -> Result<()>; + "partition_list_skipped_compactions" = list_skipped_compactions(&mut self) -> Result>; + "partition_delete_skipped_compactions" = 
delete_skipped_compactions(&mut self, partition_id: PartitionId) -> Result>; + "partition_most_recent_n" = most_recent_n(&mut self, n: usize) -> Result>; + "partition_partitions_new_file_between" = partitions_new_file_between(&mut self, minimum_time: Timestamp, maximum_time: Option) -> Result>; + "partition_get_in_skipped_compactions" = get_in_skipped_compactions(&mut self, partition_ids: &[PartitionId]) -> Result>; + "partition_list_old_style" = list_old_style(&mut self) -> Result>; + "partition_snapshot" = snapshot(&mut self, partition_id: PartitionId) -> Result; + ] +); + +decorate!( + impl_trait = ParquetFileRepo, + methods = [ + "parquet_flag_for_delete_by_retention" = flag_for_delete_by_retention(&mut self) -> Result>; + "parquet_delete_old_ids_only" = delete_old_ids_only(&mut self, older_than: Timestamp) -> Result>; + "parquet_list_by_partition_not_to_delete_batch" = list_by_partition_not_to_delete_batch(&mut self, partition_ids: Vec) -> Result>; + "parquet_get_by_object_store_id" = get_by_object_store_id(&mut self, object_store_id: ObjectStoreId) -> Result>; + "parquet_exists_by_object_store_id_batch" = exists_by_object_store_id_batch(&mut self, object_store_ids: Vec) -> Result>; + "parquet_create_upgrade_delete" = create_upgrade_delete(&mut self, partition_id: PartitionId, delete: &[ObjectStoreId], upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel) -> Result>; + ] +); diff --git a/iox_catalog/src/migrate.rs b/iox_catalog/src/migrate.rs new file mode 100644 index 0000000..5bbf963 --- /dev/null +++ b/iox_catalog/src/migrate.rs @@ -0,0 +1,2437 @@ +//! Better migrations. +//! +//! # Why +//! +//! SQLx migrations don't work for us, see: +//! +//! - +//! - +//! +//! # Usage +//! +//! Just place your migration in the `migrations` folder. They basically work like normal SQLx migrations but there are +//! a few extra, magic comments you can put in your code to modify the behavior. +//! +//! ## Steps +//! +//! 
The entire SQL text will be executed as a single statement. However, you can split it into multiple steps by using +//! a marker: +//! +//! ```sql +//! CREATE TABLE t1 (x INT); +//! +//! -- IOX_STEP_BOUNDARY +//! +//! CREATE TABLE t2 (x INT); +//! ``` +//! +//! ## Transactions & Idempotency +//! +//! All steps will be executed within one transaction. However, you can opt-out of this: +//! +//! ```sql +//! -- this step is wrapped in a transaction +//! CREATE TABLE t1 (x INT); +//! +//! -- IOX_STEP_BOUNDARY +//! +//! -- this step isn't +//! -- IOX_NO_TRANSACTION +//! CREATE TABLE t2 (x INT); +//! ``` +//! +//! If all steps can be run in a transaction, the entire migration (including its bookkeeping) will be executed in a +//! transaction. In this case, the transaction is automatically idempotent. +//! +//! Migrations that opt out of the transaction handling MUST ensure that they are idempotent. This also includes that +//! they end up in the desired target state even if they were interrupted midway in a previous run. +//! +//! ## Updating / Fixing Migrations +//! +//! **⚠️ In general a migration MUST NOT be updated / changed after it was committed to `main`. ⚠️** +//! +//! However, there is one exception to this rule: if the new version has the same outcome when applied successfully. +//! This can be due to: +//! +//! - **Optimization:** The migration script turns out to be too slow in production workloads, but you find a better +//! version that does the same but runs faster. +//! - **Failure:** The script worked fine during testing but in prod it always fails, e.g. because it is missing NULL +//! handling. It is important to remember that the fix MUST NOT change the outcome of the successful runs. +//! - **Idempotency:** The script works only w/o transactions (see section above) and cannot be re-applied when +//! interrupted midway. One common case is `CREATE INDEX CONCURRENTLY ...` where you MUST drop the index beforehand +//! 
via `DROP INDEX IF EXISTS ...` because a previous interrupted migration might have left it in an invalid state. +//! See ["Building Indexes Concurrently"]. +//! +//! If you are very sure that you found a fix for your migration that does the same operation, you still MUST NOT just +//! change the existing migration. The reason is that we keep a checksum of the migration stored in the database. +//! Changing the script will change the checksum, which will lead to a [failure](MigrateError::VersionMismatch) when +//! running the migrations. You can work around that by obtaining the old checksum (in hex) and adding it to the new +//! version as: `-- IOX_OTHER_CHECKSUM: 42feedbull`. This pragma can be repeated multiple times. +//! +//! ### Example +//! +//! If the old migration script looks like this: +//! +//! ```sql +//! -- IOX_NO_TRANSACTION +//! SET statement_timeout TO '60min'; +//! +//! -- IOX_STEP_BOUNDARY +//! +//! -- IOX_NO_TRANSACTION +//! CREATE INDEX CONCURRENTLY IF NOT EXISTS i ON t (x); +//! ``` +//! +//! You can fix the idempotency by creating a new migration that contains: +//! +//! ```sql +//! -- IOX_OTHER_CHECKSUM: 067431eaa74f26ee86200aaed4992a5fe22354322102f1ed795e424ec529469079569072d856e96ee9fdb6cc848b6137 +//! -- IOX_NO_TRANSACTION +//! SET statement_timeout TO '60min'; +//! +//! -- IOX_STEP_BOUNDARY +//! DROP INDEX CONCURRENTLY IF EXISTS i; +//! +//! -- IOX_NO_TRANSACTION +//! +//! -- IOX_STEP_BOUNDARY +//! +//! -- IOX_NO_TRANSACTION +//! CREATE INDEX CONCURRENTLY IF NOT EXISTS i ON t (x); +//! ``` +//! +//! ## Non-SQL steps +//! +//! At the moment, we only support SQL-based migration steps, but other step types can easily be added. +//! +//! 
["Building Indexes Concurrently"]: https://www.postgresql.org/docs/15/sql-createindex.html#SQL-CREATEINDEX-CONCURRENTLY + +use std::{ + borrow::Cow, + collections::{HashMap, HashSet}, + hash::{Hash, Hasher}, + ops::Deref, + str::FromStr, + time::{Duration, Instant}, +}; + +use async_trait::async_trait; +use observability_deps::tracing::{debug, info, warn}; +use siphasher::sip::SipHasher13; +use sqlx::{ + migrate::{Migrate, MigrateError, Migration, MigrationType, Migrator}, + query, query_as, query_scalar, Acquire, Connection, Executor, PgConnection, Postgres, + Transaction, +}; + +/// A single [`IOxMigration`] step. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum IOxMigrationStep { + /// Execute a SQL statement. + /// + /// A SQL statement MAY contain multiple sub-statements, e.g.: + /// + /// ```sql + /// CREATE TABLE IF NOT EXISTS table1 ( + /// id BIGINT GENERATED ALWAYS AS IDENTITY, + /// PRIMARY KEY (id), + /// ); + /// + /// CREATE TABLE IF NOT EXISTS table2 ( + /// id BIGINT GENERATED ALWAYS AS IDENTITY, + /// PRIMARY KEY (id), + /// ); + /// ``` + SqlStatement { + /// The SQL text. + /// + /// If [`in_transaction`](Self::SqlStatement::in_transaction) is set, this MUST NOT contain any transaction + /// modifiers like `COMMIT`/`ROLLBACK`/`BEGIN`! + sql: Cow<'static, str>, + + /// Should the execution of the SQL text be wrapped into a transaction? + /// + /// Whenever possible, you likely want to set this to `true`. However, some database changes like `CREATE INDEX + /// CONCURRENTLY` under PostgreSQL cannot be executed within a transaction. + in_transaction: bool, + }, +} + +impl IOxMigrationStep { + /// Apply migration step. + async fn apply(&self, conn: &mut C) -> Result<(), MigrateError> + where + C: IOxMigrate, + { + match self { + Self::SqlStatement { sql, .. } => { + conn.exec(sql).await?; + } + } + + Ok(()) + } + + /// Will this step set up a transaction if there is none yet? 
+ fn in_transaction(&self) -> bool { + match self { + Self::SqlStatement { in_transaction, .. } => *in_transaction, + } + } +} + +/// Migration checksum. +#[derive(Clone, PartialEq, Eq)] +pub struct Checksum(Box<[u8]>); + +impl Checksum { + fn as_bytes(&self) -> &[u8] { + &self.0 + } +} + +impl std::fmt::Debug for Checksum { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for b in &*self.0 { + write!(f, "{:02x}", b)?; + } + Ok(()) + } +} + +impl std::fmt::Display for Checksum { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + +impl From<[u8; N]> for Checksum { + fn from(value: [u8; N]) -> Self { + Self(value.into()) + } +} + +impl From<&[u8]> for Checksum { + fn from(value: &[u8]) -> Self { + Self(value.into()) + } +} + +impl FromStr for Checksum { + type Err = MigrateError; + + fn from_str(s: &str) -> Result { + let inner = (0..s.len()) + .step_by(2) + .map(|i| u8::from_str_radix(&s[i..(i + 2).min(s.len())], 16)) + .collect::, _>>() + .map_err(|e| { + MigrateError::Source(format!("cannot parse checksum '{s}': {e}").into()) + })?; + + Ok(Self(inner)) + } +} + +/// Database migration. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct IOxMigration { + /// Version. + /// + /// This is used to order migrations. + pub version: i64, + + /// Human-readable description. + pub description: Cow<'static, str>, + + /// Steps that compose this migration. + /// + /// In most cases you want a single [SQL step](IOxMigrationStep::SqlStatement) which is executed + /// [in a transaction](IOxMigrationStep::SqlStatement::in_transaction). + pub steps: Box<[IOxMigrationStep]>, + + /// Checksum of the given steps. + pub checksum: Checksum, + + /// Checksums of other versions of this migration that are known to be compatible. + /// + /// **Using this should be a rare exception!** + /// + /// This can be used to convert a non-idempotent migration into an idempotent one. 
+ pub other_compatible_checksums: Box<[Checksum]>, +} + +impl IOxMigration { + /// Apply migration and return elapsed wall-clock time (measured locally). + async fn apply(&self, conn: &mut C) -> Result + where + C: IOxMigrate, + { + let single_transaction = self.single_transaction(); + info!( + version = self.version, + description = self.description.as_ref(), + steps = self.steps.len(), + single_transaction, + "applying migration" + ); + + let elapsed = if single_transaction { + let mut txn = conn.begin_txn().await?; + let elapsed = { + let conn = txn.acquire_conn().await?; + self.apply_inner(conn, true).await? + }; + txn.commit_txn().await?; + elapsed + } else { + self.apply_inner(conn, false).await? + }; + + info!( + version = self.version, + description = self.description.as_ref(), + steps = self.steps.len(), + elapsed_secs = elapsed.as_secs_f64(), + "migration applied" + ); + + Ok(elapsed) + } + + /// Run actual application of the migration. + /// + /// This may or may NOT be guarded by a transaction block. + async fn apply_inner(&self, conn: &mut C, single_txn: bool) -> Result + where + C: IOxMigrate, + { + let start = Instant::now(); + conn.start_migration(self).await?; + + for (i, step) in self.steps.iter().enumerate() { + info!( + version = self.version, + steps = self.steps.len(), + step = i + 1, + single_txn, + in_transaction = step.in_transaction(), + "applying migration step" + ); + + if step.in_transaction() && !single_txn { + let mut txn = conn.begin_txn().await?; + { + let conn = txn.acquire_conn().await?; + step.apply(conn).await?; + } + txn.commit_txn().await?; + } else { + step.apply(conn).await?; + } + + info!( + version = self.version, + steps = self.steps.len(), + step = i + 1, + "applied migration step" + ); + } + + let elapsed = start.elapsed(); + conn.run_sanity_checks().await?; + conn.finish_migration(self, elapsed).await?; + + Ok(elapsed) + } + + /// This migration can be run in a single transaction and will never be dirty. 
+ pub fn single_transaction(&self) -> bool { + self.steps.iter().all(|s| s.in_transaction()) + } +} + +impl TryFrom<&Migration> for IOxMigration { + type Error = MigrateError; + + fn try_from(migration: &Migration) -> Result { + if migration.migration_type != MigrationType::Simple { + return Err(MigrateError::Source( + format!( + "migration type has to be simple but is {:?}", + migration.migration_type + ) + .into(), + )); + } + + let other_compatible_checksums = migration + .sql + .lines() + .filter_map(|s| { + s.strip_prefix("-- IOX_OTHER_CHECKSUM:") + .map(|s| s.trim().parse()) + }) + .collect::>()?; + + let steps = migration + .sql + .split("-- IOX_STEP_BOUNDARY") + .map(|sql| { + let sql = sql.trim().to_owned(); + let in_transaction = !sql.contains("IOX_NO_TRANSACTION"); + IOxMigrationStep::SqlStatement { + sql: sql.into(), + in_transaction, + } + }) + .collect(); + + Ok(Self { + version: migration.version, + description: migration.description.clone(), + steps, + // Keep original (unprocessed) checksum for backwards compatibility. + checksum: migration.checksum.deref().into(), + other_compatible_checksums, + }) + } +} + +/// Migration manager. +#[derive(Debug, PartialEq, Eq)] +pub struct IOxMigrator { + /// List of migrations. + migrations: Vec, +} + +impl IOxMigrator { + /// Create new migrator. + /// + /// # Error + /// Fails if migrations are not sorted or if there are duplicate [versions](IOxMigration::version). 
+ pub fn try_new( + migrations: impl IntoIterator, + ) -> Result { + let migrations = migrations.into_iter().collect::>(); + + if let Some(m) = migrations.windows(2).find(|m| m[0].version > m[1].version) { + return Err(MigrateError::Source( + format!( + "migrations are not sorted: version {} is before {} but should not be", + m[0].version, m[1].version, + ) + .into(), + )); + } + if let Some(m) = migrations.windows(2).find(|m| m[0].version == m[1].version) { + return Err(MigrateError::Source( + format!( + "migrations are not unique: version {} found twice", + m[0].version, + ) + .into(), + )); + } + + Ok(Self { migrations }) + } + + /// Run migrator on connection/pool. + /// + /// Returns set of executed [migrations](IOxMigration). + /// + /// This may fail and some migrations may be applied. Also, it is possible that a migration itself fails half-way, + /// in which case it is marked as dirty. Subsequent migrations will fail until the issue is resolved. + pub async fn run<'a, A>(&self, migrator: A) -> Result, MigrateError> + where + A: Acquire<'a> + Send, + ::Target: IOxMigrate, + { + let mut conn = migrator.acquire().await?; + self.run_direct(&mut *conn).await + } + + /// Run migrator on open connection. + /// + /// See docs for [run](Self::run). + async fn run_direct(&self, conn: &mut C) -> Result, MigrateError> + where + C: IOxMigrate, + { + let lock_id = conn.generate_lock_id().await?; + ::lock(conn, lock_id).await?; + + let run_res = self.run_inner(conn).await; + + // always try to unlock, even when we failed. + // While PG is timing out the lock, unlocking manually will give others the chance to re-lock faster. This is + // mostly relevant for tests where we re-use connections. 
+ let unlock_res = ::unlock(conn, lock_id).await; + + // return first error but also first OK (there doesn't seem to be an stdlib method for this) + match (run_res, unlock_res) { + (Err(e), _) => Err(e), + (Ok(_), Err(e)) => Err(e), + (Ok(res), Ok(())) => Ok(res), + } + } + + /// Run migrator. + /// + /// This expects that locking was already done. + async fn run_inner(&self, conn: &mut C) -> Result, MigrateError> + where + C: IOxMigrate, + { + // creates [_migrations] table only if needed + // eventually this will likely migrate previous versions of the table + conn.ensure_migrations_table().await?; + + let applied_migrations = ::list_applied_migrations(conn).await?; + validate_applied_migrations(&applied_migrations, self)?; + + let applied_and_not_dirty: HashSet<_> = applied_migrations + .into_iter() + .filter(|m| !m.dirty) + .map(|m| m.version) + .collect(); + + let mut new_migrations = HashSet::new(); + for migration in &self.migrations { + if applied_and_not_dirty.contains(&migration.version) { + continue; + } + migration.apply(conn).await?; + new_migrations.insert(migration.version); + } + + Ok(new_migrations) + } +} + +impl TryFrom<&Migrator> for IOxMigrator { + type Error = MigrateError; + + fn try_from(migrator: &Migrator) -> Result { + if migrator.ignore_missing { + return Err(MigrateError::Source( + "`Migrator::ignore_missing` MUST NOT be set" + .to_owned() + .into(), + )); + } + if !migrator.locking { + return Err(MigrateError::Source( + "`Migrator::locking` MUST be set".to_owned().into(), + )); + } + + let migrations = migrator + .migrations + .iter() + .map(|migration| migration.try_into()) + .collect::, _>>()?; + + Self::try_new(migrations) + } +} + +/// Validate already-applied migrations +/// +/// Checks that: +/// +/// - all applied migrations are known or all known migrations are applied +/// - checksum of applied migration and known migration match +/// - new migrations are newer than both the successfully applied and the dirty version +/// - 
there is at most one dirty migration (bug check) +/// - the dirty migration is the last applied one (bug check) +fn validate_applied_migrations( + applied_migrations: &[IOxAppliedMigration], + migrator: &IOxMigrator, +) -> Result<(), MigrateError> { + let migrations: HashMap<_, _> = migrator.migrations.iter().map(|m| (m.version, m)).collect(); + + let mut dirty_version = None; + for (idx, applied_migration) in applied_migrations.iter().enumerate() { + match migrations.get(&applied_migration.version) { + None => { + if idx == migrations.len() && dirty_version.is_none() { + // All migrations in `migrator` have been applied + // We therefore continue as this should not prevent startup + // if there are no local migrations to apply + warn!("found applied migrations not present locally, but all local migrations applied - continuing"); + return Ok(()); + } + + return Err(MigrateError::VersionMissing(applied_migration.version)); + } + Some(migration) => { + if !std::iter::once(&migration.checksum) + .chain(migration.other_compatible_checksums.iter()) + .any(|cs| cs.as_bytes() == applied_migration.checksum.deref()) + { + return Err(MigrateError::VersionMismatch(migration.version)); + } + + if applied_migration.dirty { + if let Some(first) = dirty_version { + return Err(MigrateError::Source(format!( + "there are multiple dirty versions, this should not happen and is considered a bug: {:?}", + &[first, migration.version], + ).into())); + } + dirty_version = Some(migration.version); + warn!( + version = migration.version, + "found dirty migration, trying to recover" + ); + } + } + } + } + + let applied_last = applied_migrations + .iter() + .filter(|m| Some(m.version) != dirty_version) + .map(|m| m.version) + .max(); + if let (Some(applied_last), Some(dirty_version)) = (applied_last, dirty_version) { + // algorithm error in this method, use an assertion + assert_ne!(applied_last, dirty_version); + + if applied_last > dirty_version { + // database state error, so use a proper 
error + return Err(MigrateError::Source(format!( + "dirty version ({dirty_version}) is not the last applied version ({applied_last}), this is a bug", + ).into())); + } + } + + let applied_set = applied_migrations + .iter() + .map(|m| m.version) + .collect::>(); + let new_first = migrator + .migrations + .iter() + .filter(|m| !applied_set.contains(&m.version)) + .map(|m| m.version) + .min(); + if let (Some(dirty_version), Some(new_first)) = (dirty_version, new_first) { + // algorithm error in this method, use an assertion + assert_ne!(dirty_version, new_first); + + if dirty_version > new_first { + // database state error, so use a proper error + return Err(MigrateError::Source( + format!( + "new migration ({new_first}) goes before dirty version ({dirty_version}), \ + this should not have been merged!", + ) + .into(), + )); + } + } + if let (Some(applied_last), Some(new_first)) = (applied_last, new_first) { + // algorithm error in this method, use an assertion + assert_ne!(applied_last, new_first); + + if applied_last > new_first { + // database state error, so use a proper error + return Err(MigrateError::Source( + format!( + "new migration ({new_first}) goes before last applied migration ({applied_last}), \ + this should not have been merged!", + ) + .into(), + )); + } + } + + Ok(()) +} + +/// Information about a migration found in the database. +#[derive(Debug)] +pub struct IOxAppliedMigration { + /// Version of the migration. + pub version: i64, + + /// Checksum. + pub checksum: Cow<'static, [u8]>, + + /// Dirty flag. + /// + /// If this is set, then the migration was interrupted midway. + pub dirty: bool, +} + +/// Transaction type linked to [`IOxMigrate`]. +/// +/// This is a separate type because we need to own the transaction object at some point before handing out mutable +/// borrows to the actual connection again. +#[async_trait] +pub trait IOxMigrateTxn: Send { + /// The migration interface. + type M: IOxMigrate; + + /// Acquire connection. 
+ async fn acquire_conn(&mut self) -> Result<&mut Self::M, MigrateError>; + + /// Commit transaction. + async fn commit_txn(self) -> Result<(), MigrateError>; +} + +/// Interface of a specific database implementation (like Postgres) and the IOx migration system. +/// +/// This mostly delegates to the SQLx [`Migrate`] interface but also has some extra methods. +#[async_trait] +pub trait IOxMigrate: Connection + Migrate + Send { + /// Transaction type. + type Txn<'a>: IOxMigrateTxn + where + Self: 'a; + + /// Start new transaction. + async fn begin_txn<'a>(&'a mut self) -> Result, MigrateError>; + + /// Generate a lock ID that is used for [`lock`](Self::lock) and [`unlock`](Self::unlock). + async fn generate_lock_id(&mut self) -> Result; + + /// Lock database for migrations. + async fn lock(&mut self, lock_id: i64) -> Result<(), MigrateError>; + + /// Unlock database after migration. + async fn unlock(&mut self, lock_id: i64) -> Result<(), MigrateError>; + + /// Get list of applied migrations. + async fn list_applied_migrations(&mut self) -> Result, MigrateError>; + + /// Start a migration and mark it as "not finished". + async fn start_migration(&mut self, migration: &IOxMigration) -> Result<(), MigrateError>; + + /// Finish a migration and register the elapsed time. + async fn finish_migration( + &mut self, + migration: &IOxMigration, + elapsed: Duration, + ) -> Result<(), MigrateError>; + + /// Execute a SQL statement (that may contain multiple sub-statements) + async fn exec(&mut self, sql: &str) -> Result<(), MigrateError>; + + /// Run DB-specific sanity checks on the schema. + /// + /// This mostly includes checks for "validity" markers (e.g. for indices). 
+ async fn run_sanity_checks(&mut self) -> Result<(), MigrateError>; +} + +#[async_trait] +impl<'a> IOxMigrateTxn for Transaction<'a, Postgres> { + type M = PgConnection; + + async fn acquire_conn(&mut self) -> Result<&mut Self::M, MigrateError> { + let conn = self.acquire().await?; + Ok(conn) + } + + async fn commit_txn(self) -> Result<(), MigrateError> { + self.commit().await?; + Ok(()) + } +} + +#[async_trait] +impl IOxMigrate for PgConnection { + type Txn<'a> = Transaction<'a, Postgres>; + + async fn begin_txn<'a>(&'a mut self) -> Result, MigrateError> { + let txn = ::begin(self).await?; + Ok(txn) + } + + async fn generate_lock_id(&mut self) -> Result { + let db: String = query_scalar("SELECT current_database()") + .fetch_one(self) + .await?; + + // A randomly generated static siphash key to ensure all migrations use the same locks. + // + // Generated with: xxd -i -l 16 /dev/urandom + let key = [ + 0xb8, 0x52, 0x81, 0x3c, 0x12, 0x83, 0x6f, 0xd9, 0x00, 0x4f, 0xe7, 0xe3, 0x61, 0xbd, + 0x03, 0xaf, + ]; + + let mut hasher = SipHasher13::new_with_key(&key); + db.hash(&mut hasher); + + Ok(i64::from_ne_bytes(hasher.finish().to_ne_bytes())) + } + + async fn lock(&mut self, lock_id: i64) -> Result<(), MigrateError> { + loop { + let is_locked: bool = query_scalar("SELECT pg_try_advisory_lock($1)") + .bind(lock_id) + .fetch_one(&mut *self) + .await?; + + if is_locked { + return Ok(()); + } + + let t_wait = Duration::from_millis(20); + debug!( + lock_id, + t_wait_millis = t_wait.as_millis(), + "lock held, waiting" + ); + tokio::time::sleep(t_wait).await; + } + } + + async fn unlock(&mut self, lock_id: i64) -> Result<(), MigrateError> { + let was_locked: bool = query_scalar("SELECT pg_advisory_unlock($1)") + .bind(lock_id) + .fetch_one(self) + .await?; + + if !was_locked { + return Err(MigrateError::Source( + format!("did not own lock: {lock_id}").into(), + )); + } + + Ok(()) + } + + async fn list_applied_migrations(&mut self) -> Result, MigrateError> { + let rows: 
Vec<(i64, Vec, bool)> = query_as( + "SELECT version, checksum, NOT success FROM _sqlx_migrations ORDER BY version", + ) + .fetch_all(self) + .await?; + + let migrations = rows + .into_iter() + .map(|(version, checksum, dirty)| IOxAppliedMigration { + version, + checksum: checksum.into(), + dirty, + }) + .collect(); + + Ok(migrations) + } + + async fn start_migration(&mut self, migration: &IOxMigration) -> Result<(), MigrateError> { + let _ = query( + r#" +INSERT INTO _sqlx_migrations ( version, description, success, checksum, execution_time ) +VALUES ( $1, $2, FALSE, $3, -1 ) +ON CONFLICT (version) +DO NOTHING + "#, + ) + .bind(migration.version) + .bind(&*migration.description) + .bind(migration.checksum.as_bytes()) + .execute(self) + .await?; + + Ok(()) + } + + async fn finish_migration( + &mut self, + migration: &IOxMigration, + elapsed: Duration, + ) -> Result<(), MigrateError> { + let _ = query( + r#" +UPDATE _sqlx_migrations +SET success = TRUE, execution_time = $1 +WHERE version = $2 + "#, + ) + .bind(elapsed.as_nanos() as i64) + .bind(migration.version) + .execute(self) + .await?; + + Ok(()) + } + + async fn exec(&mut self, sql: &str) -> Result<(), MigrateError> { + let _ = self.execute(sql).await?; + Ok(()) + } + + async fn run_sanity_checks(&mut self) -> Result<(), MigrateError> { + let dirty_indices: Vec = query_scalar( + r#" +SELECT pg_class.relname +FROM pg_index +JOIN pg_class ON pg_index.indexrelid = pg_class.oid +JOIN pg_namespace ON pg_class.relnamespace = pg_namespace.oid +WHERE pg_namespace.nspname = current_schema() AND NOT pg_index.indisvalid +ORDER BY pg_class.relname + "#, + ) + .fetch_all(self) + .await?; + + if !dirty_indices.is_empty() { + return Err(MigrateError::Source( + format!("Found invalid indexes: {}", dirty_indices.join(", ")).into(), + )); + } + + Ok(()) + } +} + +/// Testing tools for migrations. +#[cfg(test)] +pub mod test_utils { + use super::*; + + use std::future::Future; + + /// Test migration. 
+ /// + /// This runs the migrations to check if they pass. The given factory must provide an empty schema (i.e. w/o any + /// migrations applied). + /// + /// # Tests + /// + /// This tests that: + /// + /// - **run once:** All migrations work when ran once. + /// - **idempotency:** Migrations marked as [`idempotent`](IOxMigration::idempotent) can be executed twice. + /// + /// # Error + /// + /// Fails if this finds a bug. + pub async fn test_migration( + migrator: &IOxMigrator, + factory: Factory, + ) -> Result<(), MigrateError> + where + Factory: (Fn() -> FactoryFut) + Send + Sync, + FactoryFut: Future + Send, + Pool: Send, + for<'a> &'a Pool: Acquire<'a> + Send, + for<'a> <<&'a Pool as Acquire<'a>>::Connection as Deref>::Target: IOxMigrate, + { + { + info!("test: run all migrations"); + let conn = factory().await; + let applied = migrator.run(&conn).await?; + assert_eq!(applied.len(), migrator.migrations.len()); + } + + info!("interrupt non-transaction migrations"); + for (idx_m, m) in migrator.migrations.iter().enumerate() { + if m.single_transaction() { + info!( + version = m.version, + "skip migration because single transaction property" + ); + continue; + } + + let steps = m.steps.len(); + info!( + version = m.version, + steps, "found non-transactional migration" + ); + + for step in 1..(steps + 1) { + info!(version = m.version, steps, step, "test: die after step"); + + let broken_cmd = "iox_this_is_a_broken_test_cmd"; + let migrator_broken = IOxMigrator::try_new( + migrator + .migrations + .iter() + .take(idx_m) + .cloned() + .chain(std::iter::once(IOxMigration { + steps: m + .steps + .iter() + .take(step) + .cloned() + .chain(std::iter::once(IOxMigrationStep::SqlStatement { + sql: broken_cmd.into(), + in_transaction: false, + })) + .collect(), + ..m.clone() + })), + ) + .expect("bug in test"); + + let conn = factory().await; + let err = migrator_broken.run(&conn).await.unwrap_err(); + if !err.to_string().contains(broken_cmd) { + panic!("migrator broke in 
expected way, bug in test setup: {err}"); + } + + info!( + version = m.version, + steps, step, "test: die after step, recover from error" + ); + let applied = migrator.run(&conn).await?; + assert!(applied.contains(&m.version)); + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + mod generic { + use super::*; + + use proptest::prelude::*; + + proptest! { + #[test] + fn test_checksum_string_roundtrip(s: Vec) { + let checksum_1 = Checksum::from(s.as_slice()); + let string_1 = checksum_1.to_string(); + let checksum_2 = Checksum::from_str(&string_1).unwrap(); + let string_2 = checksum_2.to_string(); + assert_eq!(checksum_1, checksum_2); + assert_eq!(string_1, string_2); + } + } + + #[test] + fn test_parse_valid_checksum() { + let actual = Checksum::from_str( + "b88c635e27f8b9ba8547b24efcb081429a8f3e85b70f35916e1900dffc4e6a77eed8a02acc7c72526dd7d50166b63fbd" + ).unwrap(); + let expected = Checksum::from([ + 184, 140, 99, 94, 39, 248, 185, 186, 133, 71, 178, 78, 252, 176, 129, 66, 154, 143, + 62, 133, 183, 15, 53, 145, 110, 25, 0, 223, 252, 78, 106, 119, 238, 216, 160, 42, + 204, 124, 114, 82, 109, 215, 213, 1, 102, 182, 63, 189, + ]); + + assert_eq!(actual, expected); + } + + #[test] + fn test_parse_invalid_checksum() { + let err = Checksum::from_str("foo").unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: cannot parse checksum 'foo': invalid digit found in string", + ); + } + + #[test] + fn test_migrator_new_error_not_sorted() { + let err = IOxMigrator::try_new([ + IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: migrations are not sorted: version 2 is before 1 but should not be", + ); + } + + 
#[test] + fn test_migrator_new_error_not_unique() { + let err = IOxMigrator::try_new([ + IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: migrations are not unique: version 2 found twice", + ); + } + + #[test] + fn test_convert_migrator_from_sqlx_error_no_locking() { + let err = IOxMigrator::try_from(&Migrator { + migrations: vec![].into(), + ignore_missing: false, + locking: false, + }) + .unwrap_err(); + assert_eq!( + err.to_string(), + "while resolving migrations: `Migrator::locking` MUST be set", + ); + } + + #[test] + fn test_convert_migrator_from_sqlx_error_ignore_missing() { + let err = IOxMigrator::try_from(&Migrator { + migrations: vec![].into(), + ignore_missing: true, + locking: true, + }) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: `Migrator::ignore_missing` MUST NOT be set", + ); + } + + #[test] + fn test_convert_migrator_from_sqlx_error_invalid_migration_type_rev_up() { + let err = IOxMigrator::try_from(&Migrator { + migrations: vec![Migration { + version: 1, + description: "".into(), + migration_type: MigrationType::ReversibleUp, + sql: "".into(), + checksum: vec![].into(), + }] + .into(), + ignore_missing: false, + locking: true, + }) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: migration type has to be simple but is ReversibleUp", + ); + } + + #[test] + fn test_convert_migrator_from_sqlx_error_invalid_migration_type_rev_down() { + let err = IOxMigrator::try_from(&Migrator { + migrations: vec![Migration { + version: 1, + description: "".into(), + migration_type: MigrationType::ReversibleDown, + sql: "".into(), + checksum: vec![].into(), + }] + 
.into(), + ignore_missing: false, + locking: true, + }) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: migration type has to be simple but is ReversibleDown", + ); + } + + #[test] + fn test_convert_migrator_from_sqlx_error_invalid_other_compatible_checksum() { + let err = IOxMigrator::try_from(&Migrator { + migrations: vec![Migration { + version: 1, + description: "".into(), + migration_type: MigrationType::Simple, + sql: "-- IOX_OTHER_CHECKSUM: foo".into(), + checksum: vec![].into(), + }] + .into(), + ignore_missing: false, + locking: true, + }) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: cannot parse checksum 'foo': invalid digit found in string", + ); + } + + #[test] + fn test_convert_migrator_from_sqlx_ok() { + let actual = IOxMigrator::try_from(&Migrator { + migrations: vec![ + Migration { + version: 1, + description: "some descr".into(), + migration_type: MigrationType::Simple, + sql: "SELECT 1;".into(), + checksum: vec![1, 2, 3].into(), + }, + Migration { + version: 10, + description: "more descr".into(), + migration_type: MigrationType::Simple, + sql: "SELECT 2;\n-- IOX_STEP_BOUNDARY\n-- IOX_NO_TRANSACTION\nSELECT 3;" + .into(), + checksum: vec![4, 5, 6].into(), + }, + Migration { + version: 11, + description: "xxx".into(), + migration_type: MigrationType::Simple, + sql: "-- IOX_OTHER_CHECKSUM:1ff\n-- IOX_OTHER_CHECKSUM: 2ff \nSELECT4;" + .into(), + checksum: vec![7, 8, 9].into(), + }, + ] + .into(), + ignore_missing: false, + locking: true, + }) + .unwrap(); + + let expected = IOxMigrator { + migrations: vec![ + IOxMigration { + version: 1, + description: "some descr".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "SELECT 1;".into(), + in_transaction: true, + }] + .into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 10, + description: "more descr".into(), + steps: [ + IOxMigrationStep::SqlStatement { + sql: 
"SELECT 2;".into(), + in_transaction: true, + }, + IOxMigrationStep::SqlStatement { + sql: "-- IOX_NO_TRANSACTION\nSELECT 3;".into(), + in_transaction: false, + }, + ] + .into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 11, + description: "xxx".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "-- IOX_OTHER_CHECKSUM:1ff\n-- IOX_OTHER_CHECKSUM: 2ff \nSELECT4;".into(), + in_transaction: true, + }] + .into(), + checksum: [7, 8, 9].into(), + other_compatible_checksums: [ + Checksum::from_str("1ff").unwrap(), + Checksum::from_str("2ff").unwrap(), + ].into(), + }, + ], + }; + + assert_eq!(actual, expected); + } + } + + mod postgres { + use std::sync::Arc; + + use futures::{stream::FuturesUnordered, StreamExt}; + use sqlx::{pool::PoolConnection, Postgres}; + use sqlx_hotswap_pool::HotSwapPool; + use test_helpers::maybe_start_logging; + + use crate::postgres::test_utils::{maybe_skip_integration, setup_db_no_migration}; + + use super::*; + + #[tokio::test] + async fn test_lock_id_deterministic() { + maybe_skip_integration!(); + + let mut conn = setup().await; + let conn = &mut *conn; + + let first = conn.generate_lock_id().await.unwrap(); + let second = conn.generate_lock_id().await.unwrap(); + assert_eq!(first, second); + } + + #[tokio::test] + async fn test_lock_unlock_twice() { + maybe_skip_integration!(); + + let mut conn = setup().await; + let conn = &mut *conn; + + let lock_id = conn.generate_lock_id().await.unwrap(); + + ::lock(conn, lock_id) + .await + .unwrap(); + ::unlock(conn, lock_id) + .await + .unwrap(); + + ::lock(conn, lock_id) + .await + .unwrap(); + ::unlock(conn, lock_id) + .await + .unwrap(); + } + + #[tokio::test] + async fn test_lock_prevents_2nd_lock() { + maybe_skip_integration!(); + + let pool = setup_pool().await; + + let mut conn1 = pool.acquire().await.unwrap(); + let conn1 = &mut *conn1; + + let mut conn2 = pool.acquire().await.unwrap(); + let conn2 = &mut *conn2; + + let 
lock_id = conn1.generate_lock_id().await.unwrap(); + + ::lock(conn1, lock_id) + .await + .unwrap(); + tokio::time::timeout(Duration::from_secs(1), async { + ::lock(conn2, lock_id) + .await + .unwrap(); + }) + .await + .unwrap_err(); + ::unlock(conn1, lock_id) + .await + .unwrap(); + + ::lock(conn2, lock_id) + .await + .unwrap(); + ::unlock(conn2, lock_id) + .await + .unwrap(); + } + + #[tokio::test] + async fn test_locks_are_scoped() { + maybe_skip_integration!(); + + let pool = setup_pool().await; + + let mut conn1 = pool.acquire().await.unwrap(); + let conn1 = &mut *conn1; + + let mut conn2 = pool.acquire().await.unwrap(); + let conn2 = &mut *conn2; + + let lock_id1 = conn1.generate_lock_id().await.unwrap(); + let lock_id2 = !lock_id1; + + ::lock(conn1, lock_id1) + .await + .unwrap(); + ::lock(conn1, lock_id2) + .await + .unwrap(); + ::unlock(conn1, lock_id1) + .await + .unwrap(); + + // id2 is still lock (i.e. unlock is also scoped) + tokio::time::timeout(Duration::from_secs(1), async { + ::lock(conn2, lock_id2) + .await + .unwrap(); + }) + .await + .unwrap_err(); + + ::unlock(conn1, lock_id2) + .await + .unwrap(); + } + + #[tokio::test] + async fn test_unlock_without_lock_fails() { + maybe_skip_integration!(); + + let mut conn = setup().await; + let conn = &mut *conn; + + let lock_id = conn.generate_lock_id().await.unwrap(); + + let err = ::unlock(conn, lock_id) + .await + .unwrap_err(); + + assert_starts_with( + &err.to_string(), + "while resolving migrations: did not own lock:", + ); + } + + #[tokio::test] + async fn test_step_sql_statement_no_transaction() { + maybe_skip_integration!(); + + for in_transaction in [false, true] { + println!("in_transaction: {in_transaction}"); + + let mut conn = setup().await; + let conn = &mut *conn; + + conn.execute("CREATE TABLE t (x INT);").await.unwrap(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "CREATE INDEX 
CONCURRENTLY i ON t (x);".into(), + in_transaction, + }] + .into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + let res = migrator.run_direct(conn).await; + + match in_transaction { + false => { + assert_eq!(res.unwrap(), HashSet::from([1]),); + } + true => { + // `CREATE INDEX CONCURRENTLY` is NOT possible w/ a transaction. Verify that. + assert_eq!( + res.unwrap_err().to_string(), + "while executing migrations: error returned from database: \ + CREATE INDEX CONCURRENTLY cannot run inside a transaction block", + ); + } + } + } + } + + #[tokio::test] + async fn test_migrator_happy_path() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([ + IOxMigration { + version: 1, + description: "".into(), + steps: [ + IOxMigrationStep::SqlStatement { + sql: "CREATE TABLE t (x INT);".into(), + in_transaction: false, + }, + IOxMigrationStep::SqlStatement { + sql: "INSERT INTO t (x) VALUES (1); INSERT INTO t (x) VALUES (10);" + .into(), + in_transaction: true, + }, + ] + .into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 2, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "INSERT INTO t (x) VALUES (100);".into(), + in_transaction: true, + }] + .into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap(); + + let applied = migrator.run_direct(conn).await.unwrap(); + assert_eq!(applied, HashSet::from([1, 2])); + + let r: i32 = query_scalar("SELECT SUM(x)::INT AS r FROM t;") + .fetch_one(conn) + .await + .unwrap(); + + assert_eq!(r, 111); + } + + #[tokio::test] + async fn test_migrator_only_apply_new_migrations() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + // NOT 
idempotent! + sql: "CREATE TABLE t (x INT);".into(), + in_transaction: false, + }] + .into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + let applied = migrator.run_direct(conn).await.unwrap(); + assert_eq!(applied, HashSet::from([1])); + + let migrator = IOxMigrator::try_new( + migrator.migrations.iter().cloned().chain([IOxMigration { + version: 2, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + // NOT idempotent! + sql: "CREATE TABLE s (x INT);".into(), + in_transaction: false, + }] + .into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }]), + ) + .unwrap(); + + let applied = migrator.run_direct(conn).await.unwrap(); + assert_eq!(applied, HashSet::from([2])); + + let applied = migrator.run_direct(conn).await.unwrap(); + assert_eq!(applied, HashSet::from([])); + } + + #[tokio::test] + async fn test_migrator_fail_clean_migration_missing() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + migrator.run_direct(conn).await.unwrap(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + let err = migrator.run_direct(conn).await.unwrap_err(); + assert_eq!( + err.to_string(), + "migration 1 was previously applied but is missing in the resolved migrations" + ); + } + + #[tokio::test] + async fn test_migrator_fail_dirty_migration_missing() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "foo".into(), + in_transaction: 
false, + }] + .into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + migrator.run_direct(conn).await.unwrap_err(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + let err = migrator.run_direct(conn).await.unwrap_err(); + assert_eq!( + err.to_string(), + "migration 1 was previously applied but is missing in the resolved migrations" + ); + } + + #[tokio::test] + async fn test_migrator_fail_clean_checksum_mismatch() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + migrator.run_direct(conn).await.unwrap(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + let err = migrator.run_direct(conn).await.unwrap_err(); + assert_eq!( + err.to_string(), + "migration 1 was previously applied but has been modified" + ); + } + + #[tokio::test] + async fn test_migrator_fail_dirty_checksum_mismatch() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "foo".into(), + in_transaction: false, + }] + .into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + migrator.run_direct(conn).await.unwrap_err(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "foo".into(), + 
in_transaction: false, + }] + .into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + let err = migrator.run_direct(conn).await.unwrap_err(); + assert_eq!( + err.to_string(), + "migration 1 was previously applied but has been modified" + ); + } + + #[tokio::test] + async fn test_migrator_other_compatible_checksum() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + migrator.run_direct(conn).await.unwrap(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [[1, 2, 3].into()].into(), + }]) + .unwrap(); + + migrator.run_direct(conn).await.unwrap(); + } + + /// Migrations may have the same checksum. + /// + /// This is helpful if you want to revert a change later, e.g.: + /// + /// 1. add a index + /// 2. remove the index + /// 3. 
decide that you actually need the index again + #[tokio::test] + async fn test_migrator_migrations_can_have_same_checksum() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([ + IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap(); + + let applied = migrator.run_direct(conn).await.unwrap(); + assert_eq!(applied, HashSet::from([1, 2])); + } + + #[tokio::test] + async fn test_migrator_recover_dirty_same() { + test_migrator_recover_dirty_inner(RecoverFromDirtyMode::Same).await; + } + + #[tokio::test] + async fn test_migrator_recover_dirty_fix_non_transactional() { + test_migrator_recover_dirty_inner(RecoverFromDirtyMode::FixNonTransactional).await; + } + + #[tokio::test] + async fn test_migrator_recover_dirty_fix_transactional() { + test_migrator_recover_dirty_inner(RecoverFromDirtyMode::FixTransactional).await; + } + + /// Modes for [`test_migrator_recover_dirty_inner`] + #[derive(Debug)] + enum RecoverFromDirtyMode { + /// Recover from a fluke. + /// + /// The checksum of the migration stays the same and it is non-transactional (otherwise we wouldn't have + /// ended up in a dirty state to begin with). + Same, + + /// Recover using a fixed version, the fix is still non-transactional. + FixNonTransactional, + + /// Recover using a fixed version, the fix is transactional (in contrast to the original version). 
+ FixTransactional, + } + + impl RecoverFromDirtyMode { + fn same_checksum(&self) -> bool { + match self { + Self::Same => true, + Self::FixNonTransactional => false, + Self::FixTransactional => false, + } + } + + fn fix_is_transactional(&self) -> bool { + match self { + Self::Same => false, + Self::FixNonTransactional => false, + Self::FixTransactional => true, + } + } + } + + async fn test_migrator_recover_dirty_inner(mode: RecoverFromDirtyMode) { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + conn.execute("CREATE TABLE t (x INT);").await.unwrap(); + let test_query = "SELECT COALESCE(SUM(x), 0)::INT AS r FROM t;"; + + let steps_ok = vec![ + IOxMigrationStep::SqlStatement { + sql: "INSERT INTO t VALUES (1);".into(), + // set to NO transaction, otherwise the migrator will happily wrap the migration bookkeeping and the + // migration script itself into a single transaction to avoid the "dirty" state + in_transaction: mode.fix_is_transactional(), + }, + IOxMigrationStep::SqlStatement { + sql: "INSERT INTO t VALUES (2);".into(), + in_transaction: mode.fix_is_transactional(), + }, + ]; + + let mut steps_broken = steps_ok.clone(); + steps_broken[0] = IOxMigrationStep::SqlStatement { + sql: "foo".into(), + // set to NO transaction, otherwise the migrator will happily wrap the migration bookkeeping and the + // migration script itself into a single transaction to avoid the "dirty" state + in_transaction: false, + }; + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: steps_broken.into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + migrator.run_direct(conn).await.unwrap_err(); + + let r: i32 = query_scalar(test_query) + .fetch_one(&mut *conn) + .await + .unwrap(); + assert_eq!(r, 0); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: steps_ok.into(), + checksum: if 
mode.same_checksum() { + [1, 2, 3].into() + } else { + [4, 5, 6].into() + }, + other_compatible_checksums: if mode.same_checksum() { + [].into() + } else { + [[1, 2, 3].into()].into() + }, + }]) + .unwrap(); + + let applied = migrator.run_direct(conn).await.unwrap(); + assert_eq!(applied, HashSet::from([1])); + + let r: i32 = query_scalar(test_query) + .fetch_one(&mut *conn) + .await + .unwrap(); + assert_eq!(r, 3); + } + + #[tokio::test] + async fn test_migrator_uses_single_transaction_when_possible() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + conn.execute("CREATE TABLE t (x INT);").await.unwrap(); + + let steps_ok = vec![ + IOxMigrationStep::SqlStatement { + sql: "INSERT INTO t VALUES (1);".into(), + in_transaction: true, + }, + IOxMigrationStep::SqlStatement { + sql: "INSERT INTO t VALUES (2);".into(), + in_transaction: true, + }, + IOxMigrationStep::SqlStatement { + sql: "INSERT INTO t VALUES (3);".into(), + in_transaction: true, + }, + ]; + + // break in-between step that is sandwiched by two valid ones + let mut steps_broken = steps_ok.clone(); + steps_broken[1] = IOxMigrationStep::SqlStatement { + sql: "foo".into(), + in_transaction: true, + }; + + let test_query = "SELECT COALESCE(SUM(x), 0)::INT AS r FROM t;"; + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: steps_broken.into(), + // use a placeholder checksum (normally this would be calculated based on the steps) + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + migrator.run_direct(conn).await.unwrap_err(); + + // all or nothing: nothing + let r: i32 = query_scalar(test_query) + .fetch_one(&mut *conn) + .await + .unwrap(); + assert_eq!(r, 0); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: steps_ok.into(), + // same checksum, but now w/ valid steps (to simulate a once failed SQL statement) + checksum: [1, 
2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + let applied = migrator.run_direct(conn).await.unwrap(); + assert_eq!(applied, HashSet::from([1]),); + + // all or nothing: all + let r: i32 = query_scalar(test_query).fetch_one(conn).await.unwrap(); + assert_eq!(r, 6); + } + + /// Tests that `CREATE INDEX CONCURRENTLY` doesn't deadlock. + /// + /// Originally we used SQLx to acquire the locks which uses `pg_advisory_lock`. However this seems to acquire a + /// global "shared lock". Other migration frameworks faced the same issue and use `pg_try_advisory_lock` + /// instead. Also see: + /// + /// - + /// - + #[tokio::test] + async fn test_locking() { + const N_TABLES_AND_INDICES: usize = 10; + const N_CONCURRENT_MIGRATIONS: usize = 100; + + maybe_skip_integration!(); + maybe_start_logging(); + let pool = setup_pool().await; + + let migrator = Arc::new( + IOxMigrator::try_new((0..N_TABLES_AND_INDICES).map(|i| { + IOxMigration { + version: i as i64, + description: "".into(), + steps: [ + IOxMigrationStep::SqlStatement { + sql: format!("CREATE TABLE t{i} (x INT);").into(), + in_transaction: false, + }, + IOxMigrationStep::SqlStatement { + sql: format!("CREATE INDEX CONCURRENTLY i{i} ON t{i} (x);").into(), + in_transaction: false, + }, + ] + .into(), + checksum: [].into(), + other_compatible_checksums: [].into(), + } + })) + .unwrap(), + ); + + let mut futures: FuturesUnordered<_> = (0..N_CONCURRENT_MIGRATIONS) + .map(move |_| { + let migrator = Arc::clone(&migrator); + let pool = pool.clone(); + async move { + // pool might timeout, so add another retry loop around it + let mut conn = loop { + let pool = pool.clone(); + if let Ok(conn) = pool.acquire().await { + break conn; + } + }; + let conn = &mut *conn; + migrator.run_direct(conn).await.unwrap(); + } + }) + .collect(); + while futures.next().await.is_some() {} + } + + /// This tests that: + /// + /// - indexes are sanity-checked + /// - sanity checks are applied after each new/dirty 
migration and we keep the migration dirty until the checks + /// pass + /// - we can manually recover the database and make the non-idempotent migration pass + #[tokio::test] + async fn test_sanity_checks_index_1() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + conn.execute("CREATE TABLE t (x INT, y INT);") + .await + .unwrap(); + conn.execute("INSERT INTO t VALUES (1, 1);").await.unwrap(); + conn.execute("INSERT INTO t VALUES (1, 2);").await.unwrap(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS i ON t (x);".into(), + in_transaction: false, + }] + .into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + // fails because is not unique + let err = migrator.run_direct(conn).await.unwrap_err(); + assert_eq!( + err.to_string(), + "while executing migrations: error returned from database: could not create unique index \"i\"" + ); + + // re-applying fails due to sanity checks + // NOTE: Even though the actual migration script passes, the sanity checks DO NOT and hence the migration is + // still considered dirty. It will be re-applied after the manual intervention below. 
+ let err = migrator.run_direct(conn).await.unwrap_err(); + assert_eq!( + err.to_string(), + "while resolving migrations: Found invalid indexes: i" + ); + + // fix data and wipe index + conn.execute("DELETE FROM t WHERE y = 2;").await.unwrap(); + conn.execute("DROP INDEX i;").await.unwrap(); + + // applying works + let applied = migrator.run_direct(conn).await.unwrap(); + assert_eq!(HashSet::from([1]), applied); + } + + /// This tests that: + /// + /// - indexes are sanity-checked + /// - we can fix a data error and a proper, idempotent migration will eventually pass + #[tokio::test] + async fn test_sanity_checks_index_2() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + conn.execute("CREATE TABLE t (x INT, y INT);") + .await + .unwrap(); + conn.execute("INSERT INTO t VALUES (1, 1);").await.unwrap(); + conn.execute("INSERT INTO t VALUES (1, 2);").await.unwrap(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [ + IOxMigrationStep::SqlStatement { + sql: "DROP INDEX IF EXISTS i;".into(), + in_transaction: false, + }, + IOxMigrationStep::SqlStatement { + sql: "CREATE UNIQUE INDEX CONCURRENTLY i ON t (x);".into(), + in_transaction: false, + }, + ] + .into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + // fails because is not unique + let err = migrator.run_direct(conn).await.unwrap_err(); + assert_eq!( + err.to_string(), + "while executing migrations: error returned from database: could not create unique index \"i\"" + ); + + // re-applying fails with same error (index is wiped but fails w/ same error) + let err = migrator.run_direct(conn).await.unwrap_err(); + assert_eq!( + err.to_string(), + "while executing migrations: error returned from database: could not create unique index \"i\"" + ); + + // fix data issue + conn.execute("UPDATE t SET x = 2 WHERE y = 2") + .await + .unwrap(); + + // now it works + let applied = 
migrator.run_direct(conn).await.unwrap(); + assert_eq!(HashSet::from([1]), applied); + } + + #[tokio::test] + async fn test_migrator_fail_new_migration_before_applied() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migration_1 = IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }; + let migration_2 = IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }; + + let migrator = IOxMigrator::try_new([migration_2.clone()]).unwrap(); + + let applied = migrator.run_direct(conn).await.unwrap(); + assert_eq!(HashSet::from([2]), applied); + + let migrator = IOxMigrator::try_new([migration_1, migration_2]).unwrap(); + + let err = migrator.run_direct(conn).await.unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: new migration (1) goes before last applied migration (2), \ + this should not have been merged!", + ); + } + + #[tokio::test] + async fn test_migrator_fail_new_migration_before_dirty() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migration_1 = IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }; + let migration_2 = IOxMigration { + version: 2, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "foo".into(), + in_transaction: false, + }] + .into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }; + + let migrator = IOxMigrator::try_new([migration_2.clone()]).unwrap(); + + migrator.run_direct(conn).await.unwrap_err(); + + let migrator = IOxMigrator::try_new([migration_1, migration_2]).unwrap(); + + let err = migrator.run_direct(conn).await.unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving 
migrations: new migration (1) goes before dirty version (2), \ + this should not have been merged!", + ); + } + + #[tokio::test] + async fn test_migrator_bug_selftest_multiple_dirty_migrations() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([ + IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap(); + + migrator.run_direct(conn).await.unwrap(); + + conn.execute("UPDATE _sqlx_migrations SET success = FALSE;") + .await + .unwrap(); + + let err = migrator.run_direct(conn).await.unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: there are multiple dirty versions, \ + this should not happen and is considered a bug: [1, 2]", + ); + } + + #[tokio::test] + async fn test_migrator_bug_selftest_applied_after_dirty() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator = IOxMigrator::try_new([ + IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap(); + + migrator.run_direct(conn).await.unwrap(); + + conn.execute("UPDATE _sqlx_migrations SET success = FALSE WHERE version = 1;") + .await + .unwrap(); + + let err = migrator.run_direct(conn).await.unwrap_err(); + + assert_eq!( + err.to_string(), + "while resolving migrations: dirty version (1) is not the last applied version (2), this is a bug", + ); + } + + #[tokio::test] + async fn test_migrator_allows_unknown_migrations_if_they_are_clean() { + 
maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator_1 = IOxMigrator::try_new([ + IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap(); + let migrator_2 = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + migrator_1.run_direct(conn).await.unwrap(); + migrator_2.run_direct(conn).await.unwrap(); + } + + #[tokio::test] + async fn test_tester_finds_invalid_migration() { + maybe_skip_integration!(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "foo".into(), + in_transaction: true, + }] + .into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + let err = test_utils::test_migration(&migrator, setup_pool) + .await + .unwrap_err(); + + assert_eq!( + err.to_string(), + "while executing migrations: error returned from database: syntax error at or near \"foo\"", + ); + } + + #[tokio::test] + async fn test_tester_finds_non_idempotent_migration_package() { + maybe_skip_integration!(); + + let migrator = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "CREATE TABLE t (x INT);".into(), + // do NOT run this in a transaction, otherwise this is automatically idempotent + in_transaction: false, + }] + .into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + let err = test_utils::test_migration(&migrator, setup_pool) + .await + .unwrap_err(); + + assert_eq!( + 
err.to_string(), + "while executing migrations: error returned from database: relation \"t\" already exists", + ); + } + + #[tokio::test] + async fn test_tester_finds_non_idempotent_migration_step() { + maybe_skip_integration!(); + + let migrator = IOxMigrator::try_new([ + IOxMigration { + version: 1, + description: "".into(), + steps: [IOxMigrationStep::SqlStatement { + sql: "CREATE TABLE t (x INT);".into(), + in_transaction: true, + }] + .into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 2, + description: "".into(), + steps: [ + IOxMigrationStep::SqlStatement { + sql: "DROP TABLE t;".into(), + // do NOT run this in a transaction, otherwise this is automatically idempotent + in_transaction: false, + }, + IOxMigrationStep::SqlStatement { + sql: "CREATE TABLE t (x INT);".into(), + // do NOT run this in a transaction, otherwise this is automatically idempotent + in_transaction: false, + }, + ] + .into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap(); + + let err = test_utils::test_migration(&migrator, setup_pool) + .await + .unwrap_err(); + + assert_eq!( + err.to_string(), + "while executing migrations: error returned from database: table \"t\" does not exist", + ); + } + + async fn setup_pool() -> HotSwapPool { + maybe_start_logging(); + + setup_db_no_migration().await.into_pool() + } + + async fn setup() -> PoolConnection { + let pool = setup_pool().await; + pool.acquire().await.unwrap() + } + + #[track_caller] + fn assert_starts_with(s: &str, prefix: &str) { + if !s.starts_with(prefix) { + panic!("'{s}' does not start with '{prefix}'"); + } + } + } +} diff --git a/iox_catalog/src/postgres.rs b/iox_catalog/src/postgres.rs new file mode 100644 index 0000000..ef9c5d2 --- /dev/null +++ b/iox_catalog/src/postgres.rs @@ -0,0 +1,2783 @@ +//! 
A Postgres backed implementation of the Catalog + +use crate::interface::PartitionRepoExt; +use crate::{ + constants::{ + MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE, MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + }, + interface::{ + AlreadyExistsSnafu, CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, + PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, + }, + metrics::MetricDecorator, + migrate::IOxMigrator, +}; +use async_trait::async_trait; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; +use data_types::{ + partition_template::{ + NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, TemplatePart, + }, + Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, + SkippedCompaction, SortKeyIds, Table, TableId, Timestamp, +}; +use iox_time::{SystemProvider, TimeProvider}; +use metric::{Attributes, Instrument, MetricKind}; +use observability_deps::tracing::{debug, info, warn}; +use once_cell::sync::Lazy; +use parking_lot::{RwLock, RwLockWriteGuard}; +use snafu::prelude::*; +use sqlx::{ + postgres::{PgConnectOptions, PgPoolOptions}, + Acquire, ConnectOptions, Executor, Postgres, Row, +}; +use sqlx_hotswap_pool::HotSwapPool; +use std::{ + borrow::Cow, + collections::{HashMap, HashSet}, + env, + fmt::Display, + str::FromStr, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::Duration, +}; + +static MIGRATOR: Lazy = + Lazy::new(|| IOxMigrator::try_from(&sqlx::migrate!()).expect("valid migration")); + +/// Postgres connection options. +#[derive(Debug, Clone)] +pub struct PostgresConnectionOptions { + /// Application name. + /// + /// This will be reported to postgres. + pub app_name: String, + + /// Schema name. 
+ pub schema_name: String, + + /// DSN. + pub dsn: String, + + /// Maximum number of concurrent connections. + pub max_conns: u32, + + /// Set the amount of time to attempt connecting to the database. + pub connect_timeout: Duration, + + /// Set a maximum idle duration for individual connections. + pub idle_timeout: Duration, + + /// If the DSN points to a file (i.e. starts with `dsn-file://`), this sets the interval how often the the file + /// should be polled for updates. + /// + /// If an update is encountered, the underlying connection pool will be hot-swapped. + pub hotswap_poll_interval: Duration, +} + +impl PostgresConnectionOptions { + /// Default value for [`schema_name`](Self::schema_name). + pub const DEFAULT_SCHEMA_NAME: &'static str = "iox_catalog"; + + /// Default value for [`max_conns`](Self::max_conns). + pub const DEFAULT_MAX_CONNS: u32 = 10; + + /// Default value for [`connect_timeout`](Self::connect_timeout). + pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(2); + + /// Default value for [`idle_timeout`](Self::idle_timeout). + pub const DEFAULT_IDLE_TIMEOUT: Duration = Duration::from_secs(10); + + /// Default value for [`hotswap_poll_interval`](Self::hotswap_poll_interval). + pub const DEFAULT_HOTSWAP_POLL_INTERVAL: Duration = Duration::from_secs(5); +} + +impl Default for PostgresConnectionOptions { + fn default() -> Self { + Self { + app_name: String::from("iox"), + schema_name: String::from(Self::DEFAULT_SCHEMA_NAME), + dsn: String::new(), + max_conns: Self::DEFAULT_MAX_CONNS, + connect_timeout: Self::DEFAULT_CONNECT_TIMEOUT, + idle_timeout: Self::DEFAULT_IDLE_TIMEOUT, + hotswap_poll_interval: Self::DEFAULT_HOTSWAP_POLL_INTERVAL, + } + } +} + +/// PostgreSQL catalog. +#[derive(Debug)] +pub struct PostgresCatalog { + metrics: Arc, + pool: HotSwapPool, + time_provider: Arc, + // Connection options for display + options: PostgresConnectionOptions, +} + +impl PostgresCatalog { + /// Connect to the catalog store. 
+ pub async fn connect( + options: PostgresConnectionOptions, + metrics: Arc, + ) -> Result { + let pool = new_pool(&options, Arc::clone(&metrics)).await?; + + Ok(Self { + pool, + metrics, + time_provider: Arc::new(SystemProvider::new()), + options, + }) + } + + fn schema_name(&self) -> &str { + &self.options.schema_name + } + + #[cfg(test)] + pub(crate) fn into_pool(self) -> HotSwapPool { + self.pool + } +} + +impl Display for PostgresCatalog { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + // Do not include dsn in log as it may have credentials + // that should not end up in the log + "Postgres(dsn=OMITTED, schema_name='{}')", + self.schema_name() + ) + } +} + +/// transaction for [`PostgresCatalog`]. +#[derive(Debug)] +pub struct PostgresTxn { + inner: PostgresTxnInner, + time_provider: Arc, +} + +#[derive(Debug)] +struct PostgresTxnInner { + pool: HotSwapPool, +} + +impl<'c> Executor<'c> for &'c mut PostgresTxnInner { + type Database = Postgres; + + #[allow(clippy::type_complexity)] + fn fetch_many<'e, 'q: 'e, E: 'q>( + self, + query: E, + ) -> futures::stream::BoxStream< + 'e, + Result< + sqlx::Either< + ::QueryResult, + ::Row, + >, + sqlx::Error, + >, + > + where + 'c: 'e, + E: sqlx::Execute<'q, Self::Database>, + { + self.pool.fetch_many(query) + } + + fn fetch_optional<'e, 'q: 'e, E: 'q>( + self, + query: E, + ) -> futures::future::BoxFuture< + 'e, + Result::Row>, sqlx::Error>, + > + where + 'c: 'e, + E: sqlx::Execute<'q, Self::Database>, + { + self.pool.fetch_optional(query) + } + + fn prepare_with<'e, 'q: 'e>( + self, + sql: &'q str, + parameters: &'e [::TypeInfo], + ) -> futures::future::BoxFuture< + 'e, + Result<>::Statement, sqlx::Error>, + > + where + 'c: 'e, + { + self.pool.prepare_with(sql, parameters) + } + + fn describe<'e, 'q: 'e>( + self, + sql: &'q str, + ) -> futures::future::BoxFuture<'e, Result, sqlx::Error>> + where + 'c: 'e, + { + self.pool.describe(sql) + } +} + +#[async_trait] +impl Catalog for 
PostgresCatalog { + async fn setup(&self) -> Result<(), Error> { + // We need to create the schema if we're going to set it as the first item of the + // search_path otherwise when we run the sqlx migration scripts for the first time, sqlx + // will create the `_sqlx_migrations` table in the public namespace (the only namespace + // that exists), but the second time it will create it in the `` namespace and + // re-run all the migrations without skipping the ones already applied (see #3893). + // + // This makes the migrations/20210217134322_create_schema.sql step unnecessary; we need to + // keep that file because migration files are immutable. + let create_schema_query = format!("CREATE SCHEMA IF NOT EXISTS {};", self.schema_name()); + self.pool.execute(sqlx::query(&create_schema_query)).await?; + + MIGRATOR.run(&self.pool).await?; + + Ok(()) + } + + fn repositories(&self) -> Box { + Box::new(MetricDecorator::new( + PostgresTxn { + inner: PostgresTxnInner { + pool: self.pool.clone(), + }, + time_provider: Arc::clone(&self.time_provider), + }, + Arc::clone(&self.metrics), + Arc::clone(&self.time_provider), + )) + } + + #[cfg(test)] + fn metrics(&self) -> Arc { + Arc::clone(&self.metrics) + } + + fn time_provider(&self) -> Arc { + Arc::clone(&self.time_provider) + } +} + +/// Adapter to connect sqlx pools with our metrics system. +#[derive(Debug, Clone, Default)] +struct PoolMetrics { + /// Actual shared state. + state: Arc, +} + +/// Inner state of [`PoolMetrics`] that is wrapped into an [`Arc`]. +#[derive(Debug, Default)] +struct PoolMetricsInner { + /// Next pool ID. + pool_id_gen: AtomicU64, + + /// Set of known pools and their ID labels. + /// + /// Note: The pool is internally ref-counted via an [`Arc`]. Holding a reference does NOT prevent it from being closed. + pools: RwLock, sqlx::Pool)>>, +} + +impl PoolMetrics { + /// Create new pool metrics. 
+ fn new(metrics: Arc) -> Self { + metrics.register_instrument("iox_catalog_postgres", Self::default) + } + + /// Register a new pool. + fn register_pool(&self, pool: sqlx::Pool) { + let id = self + .state + .pool_id_gen + .fetch_add(1, Ordering::SeqCst) + .to_string() + .into(); + let mut pools = self.state.pools.write(); + pools.push((id, pool)); + } + + /// Remove closed pools from given list. + fn clean_pools(pools: &mut Vec<(Arc, sqlx::Pool)>) { + pools.retain(|(_id, p)| !p.is_closed()); + } +} + +impl Instrument for PoolMetrics { + fn report(&self, reporter: &mut dyn metric::Reporter) { + let mut pools = self.state.pools.write(); + Self::clean_pools(&mut pools); + let pools = RwLockWriteGuard::downgrade(pools); + + reporter.start_metric( + "sqlx_postgres_pools", + "Number of pools that sqlx uses", + MetricKind::U64Gauge, + ); + reporter.report_observation( + &Attributes::from([]), + metric::Observation::U64Gauge(pools.len() as u64), + ); + reporter.finish_metric(); + + reporter.start_metric( + "sqlx_postgres_connections", + "Number of connections within the postgres connection pool that sqlx uses", + MetricKind::U64Gauge, + ); + for (id, p) in pools.iter() { + let active = p.size() as u64; + let idle = p.num_idle() as u64; + + // We get both values independently (from underlying atomic counters) so they might be out of sync (with a + // low likelyhood). Calculating this value and emitting it is useful though since it allows easier use in + // dashboards since you can `max_over_time` w/o any recording rules. 
+ let used = active.saturating_sub(idle); + + reporter.report_observation( + &Attributes::from([ + ("pool_id", Cow::Owned(id.as_ref().to_owned())), + ("state", Cow::Borrowed("active")), + ]), + metric::Observation::U64Gauge(active), + ); + reporter.report_observation( + &Attributes::from([ + ("pool_id", Cow::Owned(id.as_ref().to_owned())), + ("state", Cow::Borrowed("idle")), + ]), + metric::Observation::U64Gauge(idle), + ); + reporter.report_observation( + &Attributes::from([ + ("pool_id", Cow::Owned(id.as_ref().to_owned())), + ("state", Cow::Borrowed("used")), + ]), + metric::Observation::U64Gauge(used), + ); + reporter.report_observation( + &Attributes::from([ + ("pool_id", Cow::Owned(id.as_ref().to_owned())), + ("state", Cow::Borrowed("max")), + ]), + metric::Observation::U64Gauge(p.options().get_max_connections() as u64), + ); + reporter.report_observation( + &Attributes::from([ + ("pool_id", Cow::Owned(id.as_ref().to_owned())), + ("state", Cow::Borrowed("min")), + ]), + metric::Observation::U64Gauge(p.options().get_min_connections() as u64), + ); + } + + reporter.finish_metric(); + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } +} + +/// Creates a new [`sqlx::Pool`] from a database config and an explicit DSN. +/// +/// This function doesn't support the IDPE specific `dsn-file://` uri scheme. +async fn new_raw_pool( + options: &PostgresConnectionOptions, + parsed_dsn: &str, + metrics: PoolMetrics, +) -> Result, sqlx::Error> { + // sqlx exposes some options as pool options, while other options are available as connection options. + let mut connect_options = PgConnectOptions::from_str(parsed_dsn)? + // the default is INFO, which is frankly surprising. + .log_statements(log::LevelFilter::Trace); + + // Workaround sqlx ignoring the SSL_CERT_FILE environment variable. + // Remove workaround when upstream sqlx handles SSL_CERT_FILE properly (#8994). 
+ let cert_file = env::var("SSL_CERT_FILE").unwrap_or_default(); + if !cert_file.is_empty() { + connect_options = connect_options.ssl_root_cert(cert_file); + } + + let app_name = options.app_name.clone(); + let app_name2 = options.app_name.clone(); // just to log below + let schema_name = options.schema_name.clone(); + let pool = PgPoolOptions::new() + .min_connections(1) + .max_connections(options.max_conns) + .acquire_timeout(options.connect_timeout) + .idle_timeout(options.idle_timeout) + .test_before_acquire(true) + .after_connect(move |c, _meta| { + let app_name = app_name.to_owned(); + let schema_name = schema_name.to_owned(); + Box::pin(async move { + // Tag the connection with the provided application name, while allowing it to + // be override from the connection string (aka DSN). + // If current_application_name is empty here it means the application name wasn't + // set as part of the DSN, and we can set it explicitly. + // Recall that this block is running on connection, not when creating the pool! + let current_application_name: String = + sqlx::query_scalar("SELECT current_setting('application_name');") + .fetch_one(&mut *c) + .await?; + if current_application_name.is_empty() { + sqlx::query("SELECT set_config('application_name', $1, false);") + .bind(&*app_name) + .execute(&mut *c) + .await?; + } + let search_path_query = format!("SET search_path TO {schema_name},public;"); + c.execute(sqlx::query(&search_path_query)).await?; + + // Ensure explicit timezone selection, instead of deferring to + // the server value. + c.execute("SET timezone = 'UTC';").await?; + Ok(()) + }) + }) + .connect_with(connect_options) + .await?; + + // Log a connection was successfully established and include the application + // name for cross-correlation between Conductor logs & database connections. 
+ info!(application_name=%app_name2, "connected to config store"); + + metrics.register_pool(pool.clone()); + Ok(pool) +} + +/// Parse a postgres catalog dsn, handling the special `dsn-file://` +/// syntax (see [`new_pool`] for more details). +/// +/// Returns an error if the dsn-file could not be read correctly. +pub fn parse_dsn(dsn: &str) -> Result { + let dsn = match get_dsn_file_path(dsn) { + Some(filename) => std::fs::read_to_string(filename)?, + None => dsn.to_string(), + }; + Ok(dsn) +} + +/// Creates a new HotSwapPool +/// +/// This function understands the IDPE specific `dsn-file://` dsn uri scheme +/// and hot swaps the pool with a new sqlx::Pool when the file changes. +/// This is useful because the credentials can be rotated by infrastructure +/// agents while the service is running. +/// +/// The file is polled for changes every `polling_interval`. +/// +/// The pool is replaced only once the new pool is successfully created. +/// The [`new_raw_pool`] function will return a new pool only if the connection +/// is successfull (see [`sqlx::pool::PoolOptions::test_before_acquire`]). +async fn new_pool( + options: &PostgresConnectionOptions, + metrics: Arc, +) -> Result, sqlx::Error> { + let parsed_dsn = parse_dsn(&options.dsn)?; + let metrics = PoolMetrics::new(metrics); + let pool = HotSwapPool::new(new_raw_pool(options, &parsed_dsn, metrics.clone()).await?); + let polling_interval = options.hotswap_poll_interval; + + if let Some(dsn_file) = get_dsn_file_path(&options.dsn) { + let pool = pool.clone(); + let options = options.clone(); + + // TODO(mkm): return a guard that stops this background worker. + // We create only one pool per process, but it would be cleaner to be + // able to properly destroy the pool. 
If we don't kill this worker we + // effectively keep the pool alive (since it holds a reference to the + // Pool) and we also potentially pollute the logs with spurious warnings + // if the dsn file disappears (this may be annoying if they show up in the test + // logs). + tokio::spawn(async move { + let mut current_dsn = parsed_dsn.clone(); + loop { + tokio::time::sleep(polling_interval).await; + + async fn try_update( + options: &PostgresConnectionOptions, + current_dsn: &str, + dsn_file: &str, + pool: &HotSwapPool, + metrics: PoolMetrics, + ) -> Result, sqlx::Error> { + let new_dsn = std::fs::read_to_string(dsn_file)?; + if new_dsn == current_dsn { + Ok(None) + } else { + let new_pool = new_raw_pool(options, &new_dsn, metrics).await?; + let old_pool = pool.replace(new_pool); + info!("replaced hotswap pool"); + info!(?old_pool, "closing old DB connection pool"); + // The pool is not closed on drop. We need to call `close`. + // It will close all idle connections, and wait until acquired connections + // are returned to the pool or closed. + old_pool.close().await; + info!(?old_pool, "closed old DB connection pool"); + Ok(Some(new_dsn)) + } + } + + match try_update(&options, ¤t_dsn, &dsn_file, &pool, metrics.clone()).await { + Ok(None) => {} + Ok(Some(new_dsn)) => { + current_dsn = new_dsn; + } + Err(e) => { + warn!( + error=%e, + filename=%dsn_file, + "not replacing hotswap pool because of an error \ + connecting to the new DSN" + ); + } + } + } + }); + } + + Ok(pool) +} + +// Parses a `dsn-file://` scheme, according to the rules of the IDPE kit/sql package. +// +// If the dsn matches the `dsn-file://` prefix, the prefix is removed and the rest is interpreted +// as a file name, in which case this function will return `Some(filename)`. +// Otherwise it will return None. No URI decoding is performed on the filename. 
+fn get_dsn_file_path(dsn: &str) -> Option { + const DSN_SCHEME: &str = "dsn-file://"; + dsn.starts_with(DSN_SCHEME) + .then(|| dsn[DSN_SCHEME.len()..].to_owned()) +} + +impl RepoCollection for PostgresTxn { + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + } +} + +async fn insert_column_with_connection<'q, E>( + executor: E, + name: &str, + table_id: TableId, + column_type: ColumnType, +) -> Result +where + E: Executor<'q, Database = Postgres>, +{ + let rec = sqlx::query_as::<_, Column>( + r#" +INSERT INTO column_name ( name, table_id, column_type ) +SELECT $1, table_id, $3 FROM ( + SELECT max_columns_per_table, namespace.id, table_name.id as table_id, COUNT(column_name.*) AS count + FROM namespace LEFT JOIN table_name ON namespace.id = table_name.namespace_id + LEFT JOIN column_name ON table_name.id = column_name.table_id + WHERE table_name.id = $2 + GROUP BY namespace.max_columns_per_table, namespace.id, table_name.id +) AS get_count WHERE count < max_columns_per_table +ON CONFLICT ON CONSTRAINT column_name_unique +DO UPDATE SET name = column_name.name +RETURNING *; + "#, + ) + .bind(name) // $1 + .bind(table_id) // $2 + .bind(column_type) // $3 + .fetch_one(executor) + .await + .map_err(|e| match e { + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!("couldn't create column {} in table {}; limit reached on namespace", name, table_id) + }, + _ => { + if is_fk_violation(&e) { + Error::NotFound { descr: e.to_string() } + } else { + Error::External { source: Box::new(e) } + } + }})?; + + ensure!( + rec.column_type == column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + name, rec.column_type, column_type + ), + } + ); 
+ + Ok(rec) +} + +#[async_trait] +impl NamespaceRepo for PostgresTxn { + async fn create( + &mut self, + name: &NamespaceName<'_>, + partition_template: Option, + retention_period_ns: Option, + service_protection_limits: Option, + ) -> Result { + let max_tables = service_protection_limits + .and_then(|l| l.max_tables) + .unwrap_or_default(); + let max_columns_per_table = service_protection_limits + .and_then(|l| l.max_columns_per_table) + .unwrap_or_default(); + + let rec = sqlx::query_as::<_, Namespace>( + r#" +INSERT INTO namespace ( + name, retention_period_ns, max_tables, max_columns_per_table, partition_template +) +VALUES ( $1, $2, $3, $4, $5 ) +RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, + partition_template; + "#, + ) + .bind(name.as_str()) // $1 + .bind(retention_period_ns) // $2 + .bind(max_tables) // $3 + .bind(max_columns_per_table) // $4 + .bind(partition_template); // $5 + + let rec = rec.fetch_one(&mut self.inner).await.map_err(|e| { + if is_unique_violation(&e) { + Error::AlreadyExists { + descr: name.to_string(), + } + } else if is_fk_violation(&e) { + Error::NotFound { + descr: e.to_string(), + } + } else { + Error::External { + source: Box::new(e), + } + } + })?; + + Ok(rec) + } + + async fn list(&mut self, deleted: SoftDeletedRows) -> Result> { + let rec = sqlx::query_as::<_, Namespace>( + format!( + r#" +SELECT id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, + partition_template +FROM namespace +WHERE {v}; + "#, + v = deleted.as_sql_predicate() + ) + .as_str(), + ) + .fetch_all(&mut self.inner) + .await?; + + Ok(rec) + } + + async fn get_by_id( + &mut self, + id: NamespaceId, + deleted: SoftDeletedRows, + ) -> Result> { + let rec = sqlx::query_as::<_, Namespace>( + format!( + r#" +SELECT id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, + partition_template +FROM namespace +WHERE id=$1 AND {v}; + "#, + v = deleted.as_sql_predicate() + ) + 
.as_str(), + ) + .bind(id) // $1 + .fetch_one(&mut self.inner) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let namespace = rec?; + + Ok(Some(namespace)) + } + + async fn get_by_name( + &mut self, + name: &str, + deleted: SoftDeletedRows, + ) -> Result> { + let rec = sqlx::query_as::<_, Namespace>( + format!( + r#" +SELECT id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, + partition_template +FROM namespace +WHERE name=$1 AND {v}; + "#, + v = deleted.as_sql_predicate() + ) + .as_str(), + ) + .bind(name) // $1 + .fetch_one(&mut self.inner) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let namespace = rec?; + + Ok(Some(namespace)) + } + + async fn soft_delete(&mut self, name: &str) -> Result<()> { + let flagged_at = Timestamp::from(self.time_provider.now()); + + // note that there is a uniqueness constraint on the name column in the DB + sqlx::query(r#"UPDATE namespace SET deleted_at=$1 WHERE name = $2;"#) + .bind(flagged_at) // $1 + .bind(name) // $2 + .execute(&mut self.inner) + .await + .map_err(Error::from) + .map(|_| ()) + } + + async fn update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result { + let rec = sqlx::query_as::<_, Namespace>( + r#" +UPDATE namespace +SET max_tables = $1 +WHERE name = $2 +RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, + partition_template; + "#, + ) + .bind(new_max) + .bind(name) + .fetch_one(&mut self.inner) + .await; + + let namespace = rec.map_err(|e| match e { + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), + }, + })?; + + Ok(namespace) + } + + async fn update_column_limit( + &mut self, + name: &str, + new_max: MaxColumnsPerTable, + ) -> Result { + let rec = sqlx::query_as::<_, Namespace>( + r#" +UPDATE namespace +SET max_columns_per_table = $1 +WHERE name = $2 +RETURNING id, name, 
retention_period_ns, max_tables, max_columns_per_table, deleted_at, + partition_template; + "#, + ) + .bind(new_max) + .bind(name) + .fetch_one(&mut self.inner) + .await; + + let namespace = rec.map_err(|e| match e { + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), + }, + })?; + + Ok(namespace) + } + + async fn update_retention_period( + &mut self, + name: &str, + retention_period_ns: Option, + ) -> Result { + let rec = sqlx::query_as::<_, Namespace>( + r#" +UPDATE namespace +SET retention_period_ns = $1 +WHERE name = $2 +RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, + partition_template; + "#, + ) + .bind(retention_period_ns) // $1 + .bind(name) // $2 + .fetch_one(&mut self.inner) + .await; + + let namespace = rec.map_err(|e| match e { + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), + }, + })?; + + Ok(namespace) + } +} + +#[async_trait] +impl TableRepo for PostgresTxn { + async fn create( + &mut self, + name: &str, + partition_template: TablePartitionTemplateOverride, + namespace_id: NamespaceId, + ) -> Result
{ + let mut tx = self.inner.pool.begin().await?; + + // A simple insert statement becomes quite complicated in order to avoid checking the table + // limits in a select and then conditionally inserting (which would be racey). + // + // from https://www.postgresql.org/docs/current/sql-insert.html + // "INSERT inserts new rows into a table. One can insert one or more rows specified by + // value expressions, or zero or more rows resulting from a query." + // By using SELECT rather than VALUES it will insert zero rows if it finds a null in the + // subquery, i.e. if count >= max_tables. fetch_one() will return a RowNotFound error if + // nothing was inserted. Not pretty! + let table = sqlx::query_as::<_, Table>( + r#" +INSERT INTO table_name ( name, namespace_id, partition_template ) +SELECT $1, id, $2 FROM ( + SELECT namespace.id AS id, max_tables, COUNT(table_name.*) AS count + FROM namespace LEFT JOIN table_name ON namespace.id = table_name.namespace_id + WHERE namespace.id = $3 + GROUP BY namespace.max_tables, table_name.namespace_id, namespace.id +) AS get_count WHERE count < max_tables +RETURNING *; + "#, + ) + .bind(name) // $1 + .bind(partition_template) // $2 + .bind(namespace_id) // $3 + .fetch_one(&mut *tx) + .await + .map_err(|e| match e { + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!( + "couldn't create table {}; limit reached on namespace {}", + name, namespace_id + ), + }, + _ => { + if is_unique_violation(&e) { + Error::AlreadyExists { + descr: format!("table '{name}' in namespace {namespace_id}"), + } + } else if is_fk_violation(&e) { + Error::NotFound { + descr: e.to_string(), + } + } else { + Error::External { + source: Box::new(e), + } + } + } + })?; + + // Partitioning is only supported for tags, so create tag columns for all `TagValue` + // partition template parts. 
It's important this happens within the table creation + // transaction so that there isn't a possibility of a concurrent write creating these + // columns with an unsupported type. + for template_part in table.partition_template.parts() { + if let TemplatePart::TagValue(tag_name) = template_part { + insert_column_with_connection(&mut *tx, tag_name, table.id, ColumnType::Tag) + .await?; + } + } + + tx.commit().await?; + + Ok(table) + } + + async fn get_by_id(&mut self, table_id: TableId) -> Result> { + let rec = sqlx::query_as::<_, Table>( + r#" +SELECT * +FROM table_name +WHERE id = $1; + "#, + ) + .bind(table_id) // $1 + .fetch_one(&mut self.inner) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let table = rec?; + + Ok(Some(table)) + } + + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result> { + let rec = sqlx::query_as::<_, Table>( + r#" +SELECT * +FROM table_name +WHERE namespace_id = $1 AND name = $2; + "#, + ) + .bind(namespace_id) // $1 + .bind(name) // $2 + .fetch_one(&mut self.inner) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let table = rec?; + + Ok(Some(table)) + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let rec = sqlx::query_as::<_, Table>( + r#" +SELECT * +FROM table_name +WHERE namespace_id = $1; + "#, + ) + .bind(namespace_id) + .fetch_all(&mut self.inner) + .await?; + + Ok(rec) + } + + async fn list(&mut self) -> Result> { + let rec = sqlx::query_as::<_, Table>("SELECT * FROM table_name;") + .fetch_all(&mut self.inner) + .await?; + + Ok(rec) + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let mut tx = self.inner.pool.begin().await?; + let rec = sqlx::query_as::<_, Table>("SELECT * from table_name WHERE id = $1 FOR UPDATE;") + .bind(table_id) // $1 + .fetch_one(&mut *tx) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return 
Err(Error::NotFound { + descr: format!("table: {table_id}"), + }); + } + let table = rec?; + + let columns = sqlx::query_as::<_, Column>("SELECT * from column_name where table_id = $1;") + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let partitions = + sqlx::query_as::<_, Partition>(r#"SELECT * FROM partition WHERE table_id = $1;"#) + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let (generation,): (i64,) = sqlx::query_as( + "UPDATE table_name SET generation = generation + 1 where id = $1 RETURNING generation;", + ) + .bind(table_id) // $1 + .fetch_one(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(TableSnapshot::encode( + table, + partitions, + columns, + generation as _, + )?) + } +} + +#[async_trait] +impl ColumnRepo for PostgresTxn { + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result { + insert_column_with_connection(&mut self.inner, name, table_id, column_type).await + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let rec = sqlx::query_as::<_, Column>( + r#" +SELECT column_name.* FROM table_name +INNER JOIN column_name on column_name.table_id = table_name.id +WHERE table_name.namespace_id = $1; + "#, + ) + .bind(namespace_id) + .fetch_all(&mut self.inner) + .await?; + + Ok(rec) + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let rec = sqlx::query_as::<_, Column>( + r#" +SELECT * FROM column_name +WHERE table_id = $1; + "#, + ) + .bind(table_id) + .fetch_all(&mut self.inner) + .await?; + + Ok(rec) + } + + async fn list(&mut self) -> Result> { + let rec = sqlx::query_as::<_, Column>("SELECT * FROM column_name;") + .fetch_all(&mut self.inner) + .await?; + + Ok(rec) + } + + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result> { + let num_columns = columns.len(); + let (v_name, v_column_type): (Vec<&str>, Vec) = columns + 
.iter() + .map(|(&name, &column_type)| (name, column_type as i16)) + .unzip(); + + // The `ORDER BY` in this statement is important to avoid deadlocks during concurrent + // writes to the same IOx table that each add many new columns. See: + // + // - + // - + // - + let out = sqlx::query_as::<_, Column>( + r#" +INSERT INTO column_name ( name, table_id, column_type ) +SELECT name, $1, column_type +FROM UNNEST($2, $3) as a(name, column_type) +ORDER BY name +ON CONFLICT ON CONSTRAINT column_name_unique +DO UPDATE SET name = column_name.name +RETURNING *; + "#, + ) + .bind(table_id) // $1 + .bind(&v_name) // $2 + .bind(&v_column_type) // $3 + .fetch_all(&mut self.inner) + .await + .map_err(|e| { + if is_fk_violation(&e) { + Error::NotFound { + descr: e.to_string(), + } + } else { + Error::External { + source: Box::new(e), + } + } + })?; + + assert_eq!(num_columns, out.len()); + + for existing in &out { + let want = columns.get(existing.name.as_str()).unwrap(); + ensure!( + existing.column_type == *want, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + existing.name, existing.column_type, want + ), + } + ); + } + + Ok(out) + } +} + +#[async_trait] +impl PartitionRepo for PostgresTxn { + async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { + let hash_id = PartitionHashId::new(table_id, &key); + + let v = sqlx::query_as::<_, Partition>( + r#" +INSERT INTO partition + (partition_key, table_id, hash_id, sort_key_ids) +VALUES + ( $1, $2, $3, '{}') +ON CONFLICT ON CONSTRAINT partition_key_unique +DO UPDATE SET partition_key = partition.partition_key +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; + "#, + ) + .bind(&key) // $1 + .bind(table_id) // $2 + .bind(&hash_id) // $3 + .fetch_one(&mut self.inner) + .await + .map_err(|e| { + if is_fk_violation(&e) { + Error::NotFound { + descr: e.to_string(), + } + } else if is_unique_violation(&e) { + // Logging more information 
to diagnose a production issue maybe + warn!( + error=?e, + %table_id, + %key, + %hash_id, + "possible duplicate partition_hash_id?" + ); + Error::External { + source: Box::new(e), + } + } else { + Error::External { + source: Box::new(e), + } + } + })?; + + Ok(v) + } + + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + let ids: Vec<_> = partition_ids.iter().map(|p| p.get()).collect(); + + sqlx::query_as::<_, Partition>( + r#" +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at +FROM partition +WHERE id = ANY($1); + "#, + ) + .bind(&ids[..]) // $1 + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + sqlx::query_as::<_, Partition>( + r#" +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at +FROM partition +WHERE table_id = $1; + "#, + ) + .bind(table_id) // $1 + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) + } + + async fn list_ids(&mut self) -> Result> { + sqlx::query_as( + r#" + SELECT p.id as partition_id + FROM partition p + "#, + ) + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) + } + + /// Update the sort key for `partition_id` if and only if `old_sort_key` + /// matches the current value in the database. + /// + /// This compare-and-swap operation is allowed to spuriously return + /// [`CasFailure::ValueMismatch`] for performance reasons (avoiding multiple + /// round trips to service a transaction in the happy path). + async fn cas_sort_key( + &mut self, + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let old_sort_key_ids = old_sort_key_ids + .map(std::ops::Deref::deref) + .unwrap_or_default(); + + // This `match` will go away when all partitions have hash IDs in the database. 
+ let query = sqlx::query_as::<_, Partition>( + r#" +UPDATE partition +SET sort_key_ids = $1 +WHERE id = $2 AND sort_key_ids = $3 +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; + "#, + ) + .bind(new_sort_key_ids) // $1 + .bind(partition_id) // $2 + .bind(old_sort_key_ids); // $3; + + let res = query.fetch_one(&mut self.inner).await; + + let partition = match res { + Ok(v) => v, + Err(sqlx::Error::RowNotFound) => { + // This update may have failed either because: + // + // * A row with the specified ID did not exist at query time + // (but may exist now!) + // * The sort key does not match. + // + // To differentiate, we submit a get partition query, returning + // the actual sort key if successful. + // + // NOTE: this is racy, but documented - this might return "Sort + // key differs! Old key: " + let partition = (self as &mut dyn PartitionRepo) + .get_by_id(partition_id) + .await + .map_err(CasFailure::QueryError)? + .ok_or(CasFailure::QueryError(Error::NotFound { + descr: partition_id.to_string(), + }))?; + return Err(CasFailure::ValueMismatch( + partition.sort_key_ids().cloned().unwrap_or_default(), + )); + } + Err(e) => { + return Err(CasFailure::QueryError(Error::External { + source: Box::new(e), + })) + } + }; + + debug!( + ?partition_id, + ?new_sort_key_ids, + "partition sort key cas successful" + ); + + Ok(partition) + } + + async fn record_skipped_compaction( + &mut self, + partition_id: PartitionId, + reason: &str, + num_files: usize, + limit_num_files: usize, + limit_num_files_first_in_partition: usize, + estimated_bytes: u64, + limit_bytes: u64, + ) -> Result<()> { + sqlx::query( + r#" +INSERT INTO skipped_compactions + ( partition_id, reason, num_files, limit_num_files, limit_num_files_first_in_partition, estimated_bytes, limit_bytes, skipped_at ) +VALUES + ( $1, $2, $3, $4, $5, $6, $7, extract(epoch from NOW()) ) +ON CONFLICT ( partition_id ) +DO UPDATE +SET +reason = EXCLUDED.reason, +num_files = EXCLUDED.num_files, 
+limit_num_files = EXCLUDED.limit_num_files, +limit_num_files_first_in_partition = EXCLUDED.limit_num_files_first_in_partition, +estimated_bytes = EXCLUDED.estimated_bytes, +limit_bytes = EXCLUDED.limit_bytes, +skipped_at = EXCLUDED.skipped_at; + "#, + ) + .bind(partition_id) // $1 + .bind(reason) + .bind(num_files as i64) + .bind(limit_num_files as i64) + .bind(limit_num_files_first_in_partition as i64) + .bind(estimated_bytes as i64) + .bind(limit_bytes as i64) + .execute(&mut self.inner) + .await?; + Ok(()) + } + + async fn get_in_skipped_compactions( + &mut self, + partition_ids: &[PartitionId], + ) -> Result> { + let rec = sqlx::query_as::<_, SkippedCompaction>( + r#"SELECT * FROM skipped_compactions WHERE partition_id = ANY($1);"#, + ) + .bind(partition_ids) // $1 + .fetch_all(&mut self.inner) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(Vec::new()); + } + + let skipped_partition_records = rec?; + + Ok(skipped_partition_records) + } + + async fn list_skipped_compactions(&mut self) -> Result> { + sqlx::query_as::<_, SkippedCompaction>( + r#" +SELECT * FROM skipped_compactions + "#, + ) + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) + } + + async fn delete_skipped_compactions( + &mut self, + partition_id: PartitionId, + ) -> Result> { + sqlx::query_as::<_, SkippedCompaction>( + r#" +DELETE FROM skipped_compactions +WHERE partition_id = $1 +RETURNING * + "#, + ) + .bind(partition_id) + .fetch_optional(&mut self.inner) + .await + .map_err(Error::from) + } + + async fn most_recent_n(&mut self, n: usize) -> Result> { + sqlx::query_as( + r#" +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at +FROM partition +ORDER BY id DESC +LIMIT $1;"#, + ) + .bind(n as i64) // $1 + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) + } + + async fn partitions_new_file_between( + &mut self, + minimum_time: Timestamp, + maximum_time: Option, + ) -> Result> { + let sql = format!( + r#" + SELECT p.id as 
partition_id + FROM partition p + WHERE p.new_file_at > $1 + {} + "#, + maximum_time + .map(|_| "AND p.new_file_at < $2") + .unwrap_or_default() + ); + + sqlx::query_as(&sql) + .bind(minimum_time) // $1 + .bind(maximum_time) // $2 + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) + } + + async fn list_old_style(&mut self) -> Result> { + // Correctness: the main caller of this function, the partition bloom + // filter, relies on all partitions being made available to it. + // + // This function MUST return the full set of old partitions to the + // caller - do NOT apply a LIMIT to this query. + // + // The load this query saves vastly outsizes the load this query causes. + sqlx::query_as( + r#" +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at +FROM partition +WHERE hash_id IS NULL +ORDER BY id DESC;"#, + ) + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) + } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let mut tx = self.inner.pool.begin().await?; + + let rec = + sqlx::query_as::<_, Partition>("SELECT * from partition WHERE id = $1 FOR UPDATE;") + .bind(partition_id) // $1 + .fetch_one(&mut *tx) + .await; + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("partition: {partition_id}"), + }); + } + let partition = rec?; + + let files = + sqlx::query_as::<_, ParquetFile>("SELECT * from parquet_file where partition_id = $1 AND parquet_file.to_delete IS NULL;") + .bind(partition_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let sc = sqlx::query_as::<_, SkippedCompaction>( + r#"SELECT * FROM skipped_compactions WHERE partition_id = $1;"#, + ) + .bind(partition_id) // $1 + .fetch_optional(&mut *tx) + .await?; + + let (generation, namespace_id): (i64,NamespaceId) = sqlx::query_as( + "UPDATE partition SET generation = partition.generation + 1 from table_name where partition.id = $1 and table_name.id = partition.table_id RETURNING partition.generation, 
table_name.namespace_id;", + ) + .bind(partition_id) // $1 + .fetch_one(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(PartitionSnapshot::encode( + namespace_id, + partition, + files, + sc, + generation as _, + )?) + } +} + +#[async_trait] +impl ParquetFileRepo for PostgresTxn { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let flagged_at = Timestamp::from(self.time_provider.now()); + // TODO - include check of table retention period once implemented + let flagged = sqlx::query( + r#" +WITH parquet_file_ids as ( + SELECT parquet_file.object_store_id + FROM namespace, parquet_file + WHERE namespace.retention_period_ns IS NOT NULL + AND parquet_file.to_delete IS NULL + AND parquet_file.max_time < $1 - namespace.retention_period_ns + AND namespace.id = parquet_file.namespace_id + LIMIT $2 +) +UPDATE parquet_file +SET to_delete = $1 +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING partition_id, object_store_id; + "#, + ) + .bind(flagged_at) // $1 + .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION) // $2 + .fetch_all(&mut self.inner) + .await?; + + let flagged = flagged + .into_iter() + .map(|row| (row.get("partition_id"), row.get("object_store_id"))) + .collect(); + Ok(flagged) + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + // see https://www.crunchydata.com/blog/simulating-update-or-delete-with-limit-in-postgres-ctes-to-the-rescue + let deleted = sqlx::query( + r#" +WITH parquet_file_ids as ( + SELECT object_store_id + FROM parquet_file + WHERE to_delete < $1 + LIMIT $2 +) +DELETE FROM parquet_file +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING object_store_id; + "#, + ) + .bind(older_than) // $1 + .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE) // $2 + .fetch_all(&mut self.inner) + .await?; + + let deleted = deleted + .into_iter() + .map(|row| row.get("object_store_id")) + .collect(); + Ok(deleted) + } + + async fn 
list_by_partition_not_to_delete_batch( + &mut self, + partition_ids: Vec, + ) -> Result> { + sqlx::query_as::<_, ParquetFile>( + r#" +SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, + object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, + compaction_level, created_at, column_set, max_l0_created_at +FROM parquet_file +WHERE parquet_file.partition_id = ANY($1) + AND parquet_file.to_delete IS NULL; + "#, + ) + .bind(partition_ids) // $1 + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result> { + let rec = sqlx::query_as::<_, ParquetFile>( + r#" +SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time, + max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set, + max_l0_created_at +FROM parquet_file +WHERE object_store_id = $1; + "#, + ) + .bind(object_store_id) // $1 + .fetch_one(&mut self.inner) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let parquet_file = rec?; + + Ok(Some(parquet_file)) + } + + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result> { + sqlx::query( + // sqlx's readme suggests using PG's ANY operator instead of IN; see link below. 
+ // https://github.com/launchbadge/sqlx/blob/main/FAQ.md#how-can-i-do-a-select--where-foo-in--query + r#" +SELECT object_store_id +FROM parquet_file +WHERE object_store_id = ANY($1); + "#, + ) + .bind(object_store_ids) // $1 + .map(|pgr| pgr.get::("object_store_id")) + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) + } + + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result> { + let delete_set: HashSet<_> = delete.iter().map(|d| d.get_uuid()).collect(); + let upgrade_set: HashSet<_> = upgrade.iter().map(|u| u.get_uuid()).collect(); + + assert!( + delete_set.is_disjoint(&upgrade_set), + "attempted to upgrade a file scheduled for delete" + ); + + let mut tx = self.inner.pool.begin().await?; + + let marked_at = Timestamp::from(self.time_provider.now()); + flag_for_delete(&mut *tx, partition_id, delete, marked_at).await?; + + update_compaction_level(&mut *tx, partition_id, upgrade, target_level).await?; + + let mut ids = Vec::with_capacity(create.len()); + for file in create { + if file.partition_id != partition_id { + return Err(Error::External { + source: format!("Inconsistent ParquetFileParams, expected PartitionId({partition_id}) got PartitionId({})", file.partition_id).into(), + }); + } + let id = create_parquet_file(&mut *tx, partition_id, file).await?; + ids.push(id); + } + + tx.commit().await?; + + Ok(ids) + } +} + +// The following three functions are helpers to the create_upgrade_delete method. +// They are also used by the respective create/flag_for_delete/update_compaction_level methods. 
+async fn create_parquet_file<'q, E>( + executor: E, + partition_id: PartitionId, + parquet_file_params: &ParquetFileParams, +) -> Result +where + E: Executor<'q, Database = Postgres>, +{ + let ParquetFileParams { + namespace_id, + table_id, + partition_id: _, + partition_hash_id, + object_store_id, + min_time, + max_time, + file_size_bytes, + row_count, + compaction_level, + created_at, + column_set, + max_l0_created_at, + } = parquet_file_params; + + let query = sqlx::query_scalar::<_, ParquetFileId>( + r#" +INSERT INTO parquet_file ( + table_id, partition_id, partition_hash_id, object_store_id, + min_time, max_time, file_size_bytes, + row_count, compaction_level, created_at, namespace_id, column_set, max_l0_created_at ) +VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 ) +RETURNING id; + "#, + ) + .bind(table_id) // $1 + .bind(partition_id) // $2 + .bind(partition_hash_id.as_ref()) // $3 + .bind(object_store_id) // $4 + .bind(min_time) // $5 + .bind(max_time) // $6 + .bind(file_size_bytes) // $7 + .bind(row_count) // $8 + .bind(compaction_level) // $9 + .bind(created_at) // $10 + .bind(namespace_id) // $11 + .bind(column_set) // $12 + .bind(max_l0_created_at); // $13 + + let parquet_file_id = query.fetch_one(executor).await.map_err(|e| { + if is_unique_violation(&e) { + Error::AlreadyExists { + descr: object_store_id.to_string(), + } + } else if is_fk_violation(&e) { + Error::NotFound { + descr: e.to_string(), + } + } else { + Error::External { + source: Box::new(e), + } + } + })?; + + Ok(parquet_file_id) +} + +async fn flag_for_delete<'q, E>( + executor: E, + partition_id: PartitionId, + ids: &[ObjectStoreId], + marked_at: Timestamp, +) -> Result<()> +where + E: Executor<'q, Database = Postgres>, +{ + let updated = + sqlx::query_as::<_, (i64,)>(r#"UPDATE parquet_file SET to_delete = $1 WHERE object_store_id = ANY($2) AND partition_id = $3 AND to_delete is NULL RETURNING id;"#) + .bind(marked_at) // $1 + .bind(ids) // $2 + .bind(partition_id) // 
$3 + .fetch_all(executor) + .await?; + + if updated.len() != ids.len() { + return Err(Error::NotFound { + descr: "parquet file(s) not found for delete".to_string(), + }); + } + + Ok(()) +} + +async fn update_compaction_level<'q, E>( + executor: E, + partition_id: PartitionId, + parquet_file_ids: &[ObjectStoreId], + compaction_level: CompactionLevel, +) -> Result<()> +where + E: Executor<'q, Database = Postgres>, +{ + let updated = sqlx::query_as::<_, (i64,)>( + r#" +UPDATE parquet_file +SET compaction_level = $1 +WHERE object_store_id = ANY($2) AND partition_id = $3 AND to_delete is NULL RETURNING id; + "#, + ) + .bind(compaction_level) // $1 + .bind(parquet_file_ids) // $2 + .bind(partition_id) // $3 + .fetch_all(executor) + .await?; + + if updated.len() != parquet_file_ids.len() { + return Err(Error::NotFound { + descr: "parquet file(s) not found for upgrade".to_string(), + }); + } + + Ok(()) +} + +/// The error code returned by Postgres for a unique constraint violation. +/// +/// See +const PG_UNIQUE_VIOLATION: &str = "23505"; + +/// Returns true if `e` is a unique constraint violation error. +fn is_unique_violation(e: &sqlx::Error) -> bool { + if let sqlx::Error::Database(inner) = e { + if let Some(code) = inner.code() { + if code == PG_UNIQUE_VIOLATION { + return true; + } + } + } + + false +} + +/// Error code returned by Postgres for a foreign key constraint violation. +const PG_FK_VIOLATION: &str = "23503"; + +fn is_fk_violation(e: &sqlx::Error) -> bool { + if let sqlx::Error::Database(inner) = e { + if let Some(code) = inner.code() { + if code == PG_FK_VIOLATION { + return true; + } + } + } + + false +} + +/// Test helpers postgres testing. +#[cfg(test)] +pub(crate) mod test_utils { + use super::*; + use rand::Rng; + use sqlx::migrate::MigrateDatabase; + + pub(crate) const TEST_DSN_ENV: &str = "TEST_INFLUXDB_IOX_CATALOG_DSN"; + + /// Helper macro to skip tests if TEST_INTEGRATION and TEST_INFLUXDB_IOX_CATALOG_DSN environment + /// variables are not set. 
+ macro_rules! maybe_skip_integration { + ($panic_msg:expr) => {{ + dotenvy::dotenv().ok(); + + let required_vars = [crate::postgres::test_utils::TEST_DSN_ENV]; + let unset_vars: Vec<_> = required_vars + .iter() + .filter_map(|&name| match std::env::var(name) { + Ok(_) => None, + Err(_) => Some(name), + }) + .collect(); + let unset_var_names = unset_vars.join(", "); + + let force = std::env::var("TEST_INTEGRATION"); + + if force.is_ok() && !unset_var_names.is_empty() { + panic!( + "TEST_INTEGRATION is set, \ + but variable(s) {} need to be set", + unset_var_names + ); + } else if force.is_err() { + eprintln!( + "skipping Postgres integration test - set {}TEST_INTEGRATION to run", + if unset_var_names.is_empty() { + String::new() + } else { + format!("{} and ", unset_var_names) + } + ); + + let panic_msg: &'static str = $panic_msg; + if !panic_msg.is_empty() { + panic!("{}", panic_msg); + } + + return; + } + }}; + () => { + maybe_skip_integration!("") + }; + } + + pub(crate) use maybe_skip_integration; + + pub(crate) async fn create_db(dsn: &str) { + // Create the catalog database if it doesn't exist + if !Postgres::database_exists(dsn).await.unwrap() { + // Ignore failure if another test has already created the database + let _ = Postgres::create_database(dsn).await; + } + } + + pub(crate) async fn setup_db_no_migration() -> PostgresCatalog { + // create a random schema for this particular pool + let schema_name = { + // use scope to make it clear to clippy / rust that `rng` is + // not carried past await points + let mut rng = rand::thread_rng(); + (&mut rng) + .sample_iter(rand::distributions::Alphanumeric) + .filter(|c| c.is_ascii_alphabetic()) + .take(20) + .map(char::from) + .collect::() + .to_ascii_lowercase() + }; + info!(schema_name, "test schema"); + + let metrics = Arc::new(metric::Registry::default()); + let dsn = std::env::var("TEST_INFLUXDB_IOX_CATALOG_DSN").unwrap(); + + create_db(&dsn).await; + + let options = PostgresConnectionOptions { + app_name: 
String::from("test"), + schema_name: schema_name.clone(), + dsn, + max_conns: 3, + ..Default::default() + }; + let pg = PostgresCatalog::connect(options, metrics) + .await + .expect("failed to connect catalog"); + + // Create the test schema + pg.pool + .execute(format!("CREATE SCHEMA {schema_name};").as_str()) + .await + .expect("failed to create test schema"); + + // Ensure the test user has permission to interact with the test schema. + pg.pool + .execute( + format!( + "GRANT USAGE ON SCHEMA {schema_name} TO public; GRANT CREATE ON SCHEMA {schema_name} TO public;" + ) + .as_str(), + ) + .await + .expect("failed to grant privileges to schema"); + + pg + } + + pub(crate) async fn setup_db() -> PostgresCatalog { + let pg = setup_db_no_migration().await; + // Run the migrations against this random schema. + pg.setup().await.expect("failed to initialise database"); + pg + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::interface::ParquetFileRepoExt; + use crate::{ + postgres::test_utils::{ + create_db, maybe_skip_integration, setup_db, setup_db_no_migration, + }, + test_helpers::{arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table}, + }; + use assert_matches::assert_matches; + use data_types::partition_template::TemplatePart; + use generated_types::influxdata::iox::partition_template::v1 as proto; + use metric::{Observation, RawReporter}; + use std::{io::Write, ops::Deref, sync::Arc, time::Instant}; + use tempfile::NamedTempFile; + use test_helpers::maybe_start_logging; + + /// Small no-op test just to print out the migrations. + /// + /// This is helpful to look up migration checksums and debug parsing of the migration files. 
+ #[test] + fn print_migrations() { + println!("{:#?}", MIGRATOR.deref()); + } + + #[tokio::test] + async fn test_migration() { + maybe_skip_integration!(); + maybe_start_logging(); + + let postgres = setup_db_no_migration().await; + + // 1st setup + postgres.setup().await.unwrap(); + + // 2nd setup + postgres.setup().await.unwrap(); + } + + #[tokio::test] + async fn test_migration_generic() { + use crate::migrate::test_utils::test_migration; + + maybe_skip_integration!(); + maybe_start_logging(); + + test_migration(&MIGRATOR, || async { + setup_db_no_migration().await.into_pool() + }) + .await + .unwrap(); + } + + #[tokio::test] + async fn test_catalog() { + maybe_skip_integration!(); + + let postgres = setup_db().await; + + // Validate the connection time zone is the expected UTC value. + let tz: String = sqlx::query_scalar("SHOW TIME ZONE;") + .fetch_one(&postgres.pool) + .await + .expect("read time zone"); + assert_eq!(tz, "UTC"); + + let pool = postgres.pool.clone(); + let schema_name = postgres.schema_name().to_string(); + + let postgres: Arc = Arc::new(postgres); + + crate::interface_tests::test_catalog(|| async { + // Clean the schema. + pool + .execute(format!("DROP SCHEMA {schema_name} CASCADE").as_str()) + .await + .expect("failed to clean schema between tests"); + + // Recreate the test schema + pool + .execute(format!("CREATE SCHEMA {schema_name};").as_str()) + .await + .expect("failed to create test schema"); + + // Ensure the test user has permission to interact with the test schema. + pool + .execute( + format!( + "GRANT USAGE ON SCHEMA {schema_name} TO public; GRANT CREATE ON SCHEMA {schema_name} TO public;" + ) + .as_str(), + ) + .await + .expect("failed to grant privileges to schema"); + + // Run the migrations against this random schema. 
+ postgres.setup().await.expect("failed to initialise database"); + + Arc::clone(&postgres) + }) + .await; + } + + #[tokio::test] + async fn existing_partitions_without_hash_id() { + maybe_skip_integration!(); + + let postgres = setup_db().await; + let pool = postgres.pool.clone(); + let postgres: Arc = Arc::new(postgres); + let mut repos = postgres.repositories(); + + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table = arbitrary_table(&mut *repos, "table", &namespace).await; + let table_id = table.id; + let key = PartitionKey::from("francis-scott-key-key"); + + // Create a partition record in the database that has `NULL` for its `hash_id` + // value, which is what records existing before the migration adding that column will have. + sqlx::query( + r#" +INSERT INTO partition + (partition_key, table_id, sort_key_ids) +VALUES + ( $1, $2, '{}') +ON CONFLICT ON CONSTRAINT partition_key_unique +DO UPDATE SET partition_key = partition.partition_key +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; + "#, + ) + .bind(&key) // $1 + .bind(table_id) // $2 + .fetch_one(&pool) + .await + .unwrap(); + + // Check that the hash_id being null in the database doesn't break querying for partitions. + let table_partitions = repos.partitions().list_by_table_id(table_id).await.unwrap(); + assert_eq!(table_partitions.len(), 1); + let partition = &table_partitions[0]; + assert!(partition.hash_id().is_none()); + + // Call create_or_get for the same (key, table_id) pair, to ensure the write is idempotent + // and that the hash_id still doesn't get set. 
+ let inserted_again = repos + .partitions() + .create_or_get(key, table_id) + .await + .expect("idempotent write should succeed"); + + // Test: sort_key_ids from freshly insert with empty value + assert!(inserted_again.sort_key_ids().is_none()); + + assert_eq!(partition, &inserted_again); + + // Create a Parquet file record in this partition to ensure we don't break new data + // ingestion for old-style partitions + let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, partition); + let parquet_file = repos + .parquet_files() + .create(parquet_file_params) + .await + .unwrap(); + assert_eq!(parquet_file.partition_hash_id, None); + + // Add a partition record WITH a hash ID + repos + .partitions() + .create_or_get(PartitionKey::from("Something else"), table_id) + .await + .unwrap(); + + // Ensure we can list only the old-style partitions + let old_style_partitions = repos.partitions().list_old_style().await.unwrap(); + assert_eq!(old_style_partitions.len(), 1); + assert_eq!(old_style_partitions[0].id, partition.id); + } + + #[test] + fn test_parse_dsn_file() { + assert_eq!( + get_dsn_file_path("dsn-file:///tmp/my foo.txt"), + Some("/tmp/my foo.txt".to_owned()), + ); + assert_eq!(get_dsn_file_path("dsn-file:blah"), None,); + assert_eq!(get_dsn_file_path("postgres://user:pw@host/db"), None,); + } + + #[tokio::test] + async fn test_reload() { + maybe_skip_integration!(); + + const POLLING_INTERVAL: Duration = Duration::from_millis(10); + + // fetch dsn from envvar + let test_dsn = std::env::var("TEST_INFLUXDB_IOX_CATALOG_DSN").unwrap(); + create_db(&test_dsn).await; + eprintln!("TEST_DSN={test_dsn}"); + + // create a temp file to store the initial dsn + let mut dsn_file = NamedTempFile::new().expect("create temp file"); + dsn_file + .write_all(test_dsn.as_bytes()) + .expect("write temp file"); + + const TEST_APPLICATION_NAME: &str = "test_application_name"; + let dsn_good = format!("dsn-file://{}", dsn_file.path().display()); + 
eprintln!("dsn_good={dsn_good}"); + + // create a hot swap pool with test application name and dsn file pointing to tmp file. + // we will later update this file and the pool should be replaced. + let options = PostgresConnectionOptions { + app_name: TEST_APPLICATION_NAME.to_owned(), + schema_name: String::from("test"), + dsn: dsn_good, + max_conns: 3, + hotswap_poll_interval: POLLING_INTERVAL, + ..Default::default() + }; + let metrics = Arc::new(metric::Registry::new()); + let pool = new_pool(&options, metrics).await.expect("connect"); + eprintln!("got a pool"); + + // ensure the application name is set as expected + let application_name: String = + sqlx::query_scalar("SELECT current_setting('application_name') as application_name;") + .fetch_one(&pool) + .await + .expect("read application_name"); + assert_eq!(application_name, TEST_APPLICATION_NAME); + + // create a new temp file object with updated dsn and overwrite the previous tmp file + const TEST_APPLICATION_NAME_NEW: &str = "changed_application_name"; + let mut new_dsn_file = NamedTempFile::new().expect("create temp file"); + new_dsn_file + .write_all(test_dsn.as_bytes()) + .expect("write temp file"); + new_dsn_file + .write_all(format!("?application_name={TEST_APPLICATION_NAME_NEW}").as_bytes()) + .expect("write temp file"); + new_dsn_file + .persist(dsn_file.path()) + .expect("overwrite new dsn file"); + + // wait until the hotswap machinery has reloaded the updated DSN file and + // successfully performed a new connection with the new DSN. 
+ let mut application_name = "".to_string(); + let start = Instant::now(); + while start.elapsed() < Duration::from_secs(5) + && application_name != TEST_APPLICATION_NAME_NEW + { + tokio::time::sleep(POLLING_INTERVAL).await; + + application_name = sqlx::query_scalar( + "SELECT current_setting('application_name') as application_name;", + ) + .fetch_one(&pool) + .await + .expect("read application_name"); + } + assert_eq!(application_name, TEST_APPLICATION_NAME_NEW); + } + + #[tokio::test] + async fn test_billing_summary_on_parqet_file_creation() { + maybe_skip_integration!(); + + let postgres = setup_db().await; + let pool = postgres.pool.clone(); + let postgres: Arc = Arc::new(postgres); + let mut repos = postgres.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table = arbitrary_table(&mut *repos, "table", &namespace).await; + let key = "bananas"; + let partition = repos + .partitions() + .create_or_get(key.into(), table.id) + .await + .unwrap(); + + // parquet file to create- all we care about here is the size + let mut p1 = arbitrary_parquet_file_params(&namespace, &table, &partition); + p1.file_size_bytes = 1337; + let f1 = repos.parquet_files().create(p1.clone()).await.unwrap(); + // insert the same again with a different size; we should then have 3x1337 as total file + // size + p1.object_store_id = ObjectStoreId::new(); + p1.file_size_bytes *= 2; + let _f2 = repos + .parquet_files() + .create(p1.clone()) + .await + .expect("create parquet file should succeed"); + + // after adding two files we should have 3x1337 in the summary + let total_file_size_bytes: i64 = + sqlx::query_scalar("SELECT total_file_size_bytes FROM billing_summary;") + .fetch_one(&pool) + .await + .expect("fetch total file size failed"); + assert_eq!(total_file_size_bytes, 1337 * 3); + + // flag f1 for deletion and assert that the total file size is reduced accordingly. 
+ repos + .parquet_files() + .create_upgrade_delete( + partition.id, + &[f1.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .expect("flag parquet file for deletion should succeed"); + let total_file_size_bytes: i64 = + sqlx::query_scalar("SELECT total_file_size_bytes FROM billing_summary;") + .fetch_one(&pool) + .await + .expect("fetch total file size failed"); + // we marked the first file of size 1337 for deletion leaving only the second that was 2x + // that + assert_eq!(total_file_size_bytes, 1337 * 2); + + // actually deleting shouldn't change the total + let older_than = p1.created_at + 1; + repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .expect("parquet file deletion should succeed"); + let total_file_size_bytes: i64 = + sqlx::query_scalar("SELECT total_file_size_bytes FROM billing_summary;") + .fetch_one(&pool) + .await + .expect("fetch total file size failed"); + assert_eq!(total_file_size_bytes, 1337 * 2); + } + + #[tokio::test] + async fn namespace_partition_template_null_is_the_default_in_the_database() { + maybe_skip_integration!(); + + let postgres = setup_db().await; + let pool = postgres.pool.clone(); + let postgres: Arc = Arc::new(postgres); + let mut repos = postgres.repositories(); + + let namespace_name = "apples"; + + // Create a namespace record in the database that has `NULL` for its `partition_template` + // value, which is what records existing before the migration adding that column will have. 
+ let insert_null_partition_template_namespace = sqlx::query( + r#" +INSERT INTO namespace ( + name, retention_period_ns, partition_template +) +VALUES ( $1, $2, NULL ) +RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, + partition_template; + "#, + ) + .bind(namespace_name) // $1 + .bind(None::>); // $2 + + insert_null_partition_template_namespace + .fetch_one(&pool) + .await + .unwrap(); + + let lookup_namespace = repos + .namespaces() + .get_by_name(namespace_name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .unwrap(); + // When fetching this namespace from the database, the `FromRow` impl should set its + // `partition_template` to the default. + assert_eq!( + lookup_namespace.partition_template, + NamespacePartitionTemplateOverride::default() + ); + + // When creating a namespace through the catalog functions without specifying a custom + // partition template, + let created_without_custom_template = repos + .namespaces() + .create( + &"lemons".try_into().unwrap(), + None, // no partition template + None, + None, + ) + .await + .unwrap(); + + // it should have the default template in the application, + assert_eq!( + created_without_custom_template.partition_template, + NamespacePartitionTemplateOverride::default() + ); + + // and store NULL in the database record. + let record = sqlx::query("SELECT name, partition_template FROM namespace WHERE id = $1;") + .bind(created_without_custom_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(created_without_custom_template.name, name); + let partition_template: Option = + record.try_get("partition_template").unwrap(); + assert!(partition_template.is_none()); + + // When explicitly setting a template that happens to be equal to the application default, + // assume it's important that it's being specially requested and store it rather than NULL. 
+ let namespace_custom_template_name = "kumquats"; + let custom_partition_template_equal_to_default = + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat( + "%Y-%m-%d".to_owned(), + )), + }], + }) + .unwrap(); + let namespace_custom_template = repos + .namespaces() + .create( + &namespace_custom_template_name.try_into().unwrap(), + Some(custom_partition_template_equal_to_default.clone()), + None, + None, + ) + .await + .unwrap(); + assert_eq!( + namespace_custom_template.partition_template, + custom_partition_template_equal_to_default + ); + let record = sqlx::query("SELECT name, partition_template FROM namespace WHERE id = $1;") + .bind(namespace_custom_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(namespace_custom_template.name, name); + let partition_template: Option = + record.try_get("partition_template").unwrap(); + assert_eq!( + partition_template.unwrap(), + custom_partition_template_equal_to_default + ); + } + + #[tokio::test] + async fn table_partition_template_null_is_the_default_in_the_database() { + maybe_skip_integration!(); + + let postgres = setup_db().await; + let pool = postgres.pool.clone(); + let postgres: Arc = Arc::new(postgres); + let mut repos = postgres.repositories(); + + let namespace_default_template_name = "oranges"; + let namespace_default_template = repos + .namespaces() + .create( + &namespace_default_template_name.try_into().unwrap(), + None, // no partition template + None, + None, + ) + .await + .unwrap(); + + let namespace_custom_template_name = "limes"; + let namespace_custom_template = repos + .namespaces() + .create( + &namespace_custom_template_name.try_into().unwrap(), + Some( + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: 
Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }) + .unwrap(), + ), + None, + None, + ) + .await + .unwrap(); + + // In a namespace that also has a NULL template, create a table record in the database that + // has `NULL` for its `partition_template` value, which is what records existing before the + // migration adding that column will have. + let table_name = "null_template"; + let insert_null_partition_template_table = sqlx::query( + r#" +INSERT INTO table_name ( name, namespace_id, partition_template ) +VALUES ( $1, $2, NULL ) +RETURNING *; + "#, + ) + .bind(table_name) // $1 + .bind(namespace_default_template.id); // $2 + + insert_null_partition_template_table + .fetch_one(&pool) + .await + .unwrap(); + + let lookup_table = repos + .tables() + .get_by_namespace_and_name(namespace_default_template.id, table_name) + .await + .unwrap() + .unwrap(); + // When fetching this table from the database, the `FromRow` impl should set its + // `partition_template` to the system default (because the namespace didn't have a template + // either). + assert_eq!( + lookup_table.partition_template, + TablePartitionTemplateOverride::default() + ); + + // In a namespace that has a custom template, create a table record in the database that + // has `NULL` for its `partition_template` value. + // + // THIS ACTUALLY SHOULD BE IMPOSSIBLE because: + // + // * Namespaces have to exist before tables + // * `partition_tables` are immutable on both namespaces and tables + // * When the migration adding the `partition_table` column is deployed, namespaces can + // begin to be created with `partition_templates` + // * *Then* tables can be created with `partition_templates` or not + // * When tables don't get a custom table partition template but their namespace has one, + // their database record will get the namespace partition template. 
+ // + // In other words, table `partition_template` values in the database is allowed to possibly + // be `NULL` IFF their namespace's `partition_template` is `NULL`. + // + // That said, this test creates this hopefully-impossible scenario to ensure that the + // defined, expected behavior if a table record somehow exists in the database with a `NULL` + // `partition_template` value is that it will have the application default partition + // template *even if the namespace `partition_template` is not null*. + let table_name = "null_template"; + let insert_null_partition_template_table = sqlx::query( + r#" +INSERT INTO table_name ( name, namespace_id, partition_template ) +VALUES ( $1, $2, NULL ) +RETURNING *; + "#, + ) + .bind(table_name) // $1 + .bind(namespace_custom_template.id); // $2 + + insert_null_partition_template_table + .fetch_one(&pool) + .await + .unwrap(); + + let lookup_table = repos + .tables() + .get_by_namespace_and_name(namespace_custom_template.id, table_name) + .await + .unwrap() + .unwrap(); + // When fetching this table from the database, the `FromRow` impl should set its + // `partition_template` to the system default *even though the namespace has a + // template*, because this should be impossible as detailed above. 
+ assert_eq!( + lookup_table.partition_template, + TablePartitionTemplateOverride::default() + ); + + // # Table template false, namespace template true + // + // When creating a table through the catalog functions *without* a custom table template in + // a namespace *with* a custom partition template, + let table_no_template_with_namespace_template = repos + .tables() + .create( + "pomelo", + TablePartitionTemplateOverride::try_new( + None, // no custom partition template + &namespace_custom_template.partition_template, + ) + .unwrap(), + namespace_custom_template.id, + ) + .await + .unwrap(); + + // it should have the namespace's template + assert_eq!( + table_no_template_with_namespace_template.partition_template, + TablePartitionTemplateOverride::try_new( + None, + &namespace_custom_template.partition_template + ) + .unwrap() + ); + + // and store that value in the database record. + let record = sqlx::query("SELECT name, partition_template FROM table_name WHERE id = $1;") + .bind(table_no_template_with_namespace_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(table_no_template_with_namespace_template.name, name); + let partition_template: Option = + record.try_get("partition_template").unwrap(); + assert_eq!( + partition_template.unwrap(), + TablePartitionTemplateOverride::try_new( + None, + &namespace_custom_template.partition_template + ) + .unwrap() + ); + + // # Table template true, namespace template false + // + // When creating a table through the catalog functions *with* a custom table template in + // a namespace *without* a custom partition template, + let custom_table_template = proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("chemical".into())), + }], + }; + let table_with_template_no_namespace_template = repos + .tables() + .create( + "tangerine", + TablePartitionTemplateOverride::try_new( + 
Some(custom_table_template), // with custom partition template + &namespace_default_template.partition_template, + ) + .unwrap(), + namespace_default_template.id, + ) + .await + .unwrap(); + + // it should have the custom table template + let table_template_parts: Vec<_> = table_with_template_no_namespace_template + .partition_template + .parts() + .collect(); + assert_eq!(table_template_parts.len(), 1); + assert_matches!( + table_template_parts[0], + TemplatePart::TagValue(tag) if tag == "chemical" + ); + + // and store that value in the database record. + let record = sqlx::query("SELECT name, partition_template FROM table_name WHERE id = $1;") + .bind(table_with_template_no_namespace_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(table_with_template_no_namespace_template.name, name); + let partition_template = record + .try_get::, _>("partition_template") + .unwrap() + .unwrap(); + let table_template_parts: Vec<_> = partition_template.parts().collect(); + assert_eq!(table_template_parts.len(), 1); + assert_matches!( + table_template_parts[0], + TemplatePart::TagValue(tag) if tag == "chemical" + ); + + // # Table template true, namespace template true + // + // When creating a table through the catalog functions *with* a custom table template in + // a namespace *with* a custom partition template, + let custom_table_template = proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("vegetable".into())), + }], + }; + let table_with_template_with_namespace_template = repos + .tables() + .create( + "nectarine", + TablePartitionTemplateOverride::try_new( + Some(custom_table_template), // with custom partition template + &namespace_custom_template.partition_template, + ) + .unwrap(), + namespace_custom_template.id, + ) + .await + .unwrap(); + + // it should have the custom table template + let table_template_parts: Vec<_> = 
table_with_template_with_namespace_template + .partition_template + .parts() + .collect(); + assert_eq!(table_template_parts.len(), 1); + assert_matches!( + table_template_parts[0], + TemplatePart::TagValue(tag) if tag == "vegetable" + ); + + // and store that value in the database record. + let record = sqlx::query("SELECT name, partition_template FROM table_name WHERE id = $1;") + .bind(table_with_template_with_namespace_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(table_with_template_with_namespace_template.name, name); + let partition_template = record + .try_get::, _>("partition_template") + .unwrap() + .unwrap(); + let table_template_parts: Vec<_> = partition_template.parts().collect(); + assert_eq!(table_template_parts.len(), 1); + assert_matches!( + table_template_parts[0], + TemplatePart::TagValue(tag) if tag == "vegetable" + ); + + // # Table template false, namespace template false + // + // When creating a table through the catalog functions *without* a custom table template in + // a namespace *without* a custom partition template, + let table_no_template_no_namespace_template = repos + .tables() + .create( + "grapefruit", + TablePartitionTemplateOverride::try_new( + None, // no custom partition template + &namespace_default_template.partition_template, + ) + .unwrap(), + namespace_default_template.id, + ) + .await + .unwrap(); + + // it should have the default template in the application, + assert_eq!( + table_no_template_no_namespace_template.partition_template, + TablePartitionTemplateOverride::default() + ); + + // and store NULL in the database record. 
+ let record = sqlx::query("SELECT name, partition_template FROM table_name WHERE id = $1;") + .bind(table_no_template_no_namespace_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(table_no_template_no_namespace_template.name, name); + let partition_template: Option = + record.try_get("partition_template").unwrap(); + assert!(partition_template.is_none()); + } + + #[tokio::test] + async fn test_metrics() { + maybe_skip_integration!(); + + let postgres = setup_db_no_migration().await; + + let mut reporter = RawReporter::default(); + postgres.metrics.report(&mut reporter); + assert_eq!( + reporter + .metric("sqlx_postgres_connections") + .unwrap() + .observation(&[("pool_id", "0"), ("state", "min")]) + .unwrap(), + &Observation::U64Gauge(1), + ); + assert_eq!( + reporter + .metric("sqlx_postgres_connections") + .unwrap() + .observation(&[("pool_id", "0"), ("state", "max")]) + .unwrap(), + &Observation::U64Gauge(3), + ); + } +} diff --git a/iox_catalog/src/sqlite.rs b/iox_catalog/src/sqlite.rs new file mode 100644 index 0000000..e91cde3 --- /dev/null +++ b/iox_catalog/src/sqlite.rs @@ -0,0 +1,2196 @@ +//! 
A SQLite backed implementation of the Catalog

use crate::interface::PartitionRepoExt;
use crate::{
    constants::{
        MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE, MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION,
    },
    interface::{
        AlreadyExistsSnafu, CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo,
        PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo,
    },
    metrics::MetricDecorator,
};
use async_trait::async_trait;
use data_types::snapshot::partition::PartitionSnapshot;
use data_types::snapshot::table::TableSnapshot;
use data_types::{
    partition_template::{
        NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, TemplatePart,
    },
    Column, ColumnId, ColumnSet, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables,
    Namespace, NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId,
    ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId,
    PartitionKey, SkippedCompaction, SortKeyIds, Table, TableId, Timestamp,
};
use iox_time::{SystemProvider, TimeProvider};
use metric::Registry;
use observability_deps::tracing::debug;
use parking_lot::Mutex;
use serde::{Deserialize, Serialize};
use snafu::prelude::*;
use sqlx::{
    migrate::Migrator,
    sqlite::{SqliteConnectOptions, SqliteRow},
    types::Json,
    Executor, FromRow, Pool, Row, Sqlite, SqlitePool,
};
use std::{
    collections::{HashMap, HashSet},
    fmt::Display,
    str::FromStr,
    sync::Arc,
};

/// Embedded schema migrations, applied by [`Catalog::setup`].
static MIGRATOR: Migrator = sqlx::migrate!("sqlite/migrations");

/// SQLite connection options.
#[derive(Debug, Clone)]
pub struct SqliteConnectionOptions {
    /// local file path to .sqlite file
    pub file_path: String,
}

/// SQLite catalog.
//
// NOTE(review): generic type parameters in this file were lost in extraction and have been
// reconstructed from usage — confirm against the upstream source.
#[derive(Debug)]
pub struct SqliteCatalog {
    metrics: Arc<Registry>,
    pool: Pool<Sqlite>,
    time_provider: Arc<dyn TimeProvider>,
    options: SqliteConnectionOptions,
}

/// transaction for [`SqliteCatalog`].
#[derive(Debug)]
pub struct SqliteTxn {
    inner: Mutex<SqliteTxnInner>,
    time_provider: Arc<dyn TimeProvider>,
}

#[derive(Debug)]
struct SqliteTxnInner {
    pool: Pool<Sqlite>,
}

// Delegate the entire [`Executor`] surface to the wrapped pool so a `&mut SqliteTxnInner` can be
// passed anywhere a SQLite executor is expected.
impl<'c> Executor<'c> for &'c mut SqliteTxnInner {
    type Database = Sqlite;

    #[allow(clippy::type_complexity)]
    fn fetch_many<'e, 'q: 'e, E: 'q>(
        self,
        query: E,
    ) -> futures::stream::BoxStream<
        'e,
        Result<
            sqlx::Either<
                <Self::Database as sqlx::Database>::QueryResult,
                <Self::Database as sqlx::Database>::Row,
            >,
            sqlx::Error,
        >,
    >
    where
        'c: 'e,
        E: sqlx::Execute<'q, Self::Database>,
    {
        self.pool.fetch_many(query)
    }

    fn fetch_optional<'e, 'q: 'e, E: 'q>(
        self,
        query: E,
    ) -> futures::future::BoxFuture<
        'e,
        Result<Option<<Self::Database as sqlx::Database>::Row>, sqlx::Error>,
    >
    where
        'c: 'e,
        E: sqlx::Execute<'q, Self::Database>,
    {
        self.pool.fetch_optional(query)
    }

    fn prepare_with<'e, 'q: 'e>(
        self,
        sql: &'q str,
        parameters: &'e [<Self::Database as sqlx::Database>::TypeInfo],
    ) -> futures::future::BoxFuture<
        'e,
        Result<<Self::Database as sqlx::database::HasStatement<'q>>::Statement, sqlx::Error>,
    >
    where
        'c: 'e,
    {
        self.pool.prepare_with(sql, parameters)
    }

    fn describe<'e, 'q: 'e>(
        self,
        sql: &'q str,
    ) -> futures::future::BoxFuture<'e, Result<sqlx::Describe<Self::Database>, sqlx::Error>>
    where
        'c: 'e,
    {
        self.pool.describe(sql)
    }
}

impl SqliteCatalog {
    /// Connect to the catalog store.
+ pub async fn connect(options: SqliteConnectionOptions, metrics: Arc) -> Result { + let opts = SqliteConnectOptions::from_str(&options.file_path)?.create_if_missing(true); + + let pool = SqlitePool::connect_with(opts).await?; + Ok(Self { + metrics, + pool, + time_provider: Arc::new(SystemProvider::new()), + options, + }) + } +} + +impl Display for SqliteCatalog { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Sqlite(dsn='{}')", self.options.file_path) + } +} + +#[async_trait] +impl Catalog for SqliteCatalog { + async fn setup(&self) -> Result<()> { + MIGRATOR.run(&self.pool).await?; + + Ok(()) + } + + fn repositories(&self) -> Box { + Box::new(MetricDecorator::new( + SqliteTxn { + inner: Mutex::new(SqliteTxnInner { + pool: self.pool.clone(), + }), + time_provider: Arc::clone(&self.time_provider), + }, + Arc::clone(&self.metrics), + Arc::clone(&self.time_provider), + )) + } + + #[cfg(test)] + fn metrics(&self) -> Arc { + Arc::clone(&self.metrics) + } + + fn time_provider(&self) -> Arc { + Arc::clone(&self.time_provider) + } +} + +impl RepoCollection for SqliteTxn { + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + } +} + +#[async_trait] +impl NamespaceRepo for SqliteTxn { + async fn create( + &mut self, + name: &NamespaceName<'_>, + partition_template: Option, + retention_period_ns: Option, + service_protection_limits: Option, + ) -> Result { + let max_tables = service_protection_limits + .and_then(|l| l.max_tables) + .unwrap_or_default(); + let max_columns_per_table = service_protection_limits + .and_then(|l| l.max_columns_per_table) + .unwrap_or_default(); + + let rec = sqlx::query_as::<_, Namespace>( + r#" +INSERT INTO namespace ( name, 
retention_period_ns, max_tables, max_columns_per_table, partition_template )
VALUES ( $1, $2, $3, $4, $5 )
RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at,
          partition_template;
        "#,
        )
        .bind(name.as_str()) // $1
        .bind(retention_period_ns) // $2
        .bind(max_tables) // $3
        .bind(max_columns_per_table) // $4
        .bind(partition_template); // $5

        let rec = rec.fetch_one(self.inner.get_mut()).await.map_err(|e| {
            if is_unique_violation(&e) {
                // The unique constraint on `name` fired: the namespace already exists.
                Error::AlreadyExists {
                    descr: name.to_string(),
                }
            } else if is_fk_violation(&e) {
                Error::NotFound {
                    descr: e.to_string(),
                }
            } else {
                Error::External {
                    source: Box::new(e),
                }
            }
        })?;

        Ok(rec)
    }

    async fn list(&mut self, deleted: SoftDeletedRows) -> Result<Vec<Namespace>> {
        let rec = sqlx::query_as::<_, Namespace>(
            format!(
                r#"
SELECT id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at,
       partition_template
FROM namespace
WHERE {v};
                "#,
                v = deleted.as_sql_predicate()
            )
            .as_str(),
        )
        .fetch_all(self.inner.get_mut())
        .await?;

        Ok(rec)
    }

    async fn get_by_id(
        &mut self,
        id: NamespaceId,
        deleted: SoftDeletedRows,
    ) -> Result<Option<Namespace>> {
        let rec = sqlx::query_as::<_, Namespace>(
            format!(
                r#"
SELECT id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at,
       partition_template
FROM namespace
WHERE id=$1 AND {v};
                "#,
                v = deleted.as_sql_predicate()
            )
            .as_str(),
        )
        .bind(id) // $1
        .fetch_one(self.inner.get_mut())
        .await;

        // A missing row is `Ok(None)`, not an error.
        if let Err(sqlx::Error::RowNotFound) = rec {
            return Ok(None);
        }

        let namespace = rec?;

        Ok(Some(namespace))
    }

    async fn get_by_name(
        &mut self,
        name: &str,
        deleted: SoftDeletedRows,
    ) -> Result<Option<Namespace>> {
        let rec = sqlx::query_as::<_, Namespace>(
            format!(
                r#"
SELECT id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at,
       partition_template
FROM namespace
WHERE name=$1 AND {v};
                "#,
                v = deleted.as_sql_predicate()
            )
            .as_str(),
        )
        .bind(name) // $1
        .fetch_one(self.inner.get_mut())
        .await;

        if let Err(sqlx::Error::RowNotFound) = rec {
            return Ok(None);
        }

        let namespace = rec?;

        Ok(Some(namespace))
    }

    async fn soft_delete(&mut self, name: &str) -> Result<()> {
        let flagged_at = Timestamp::from(self.time_provider.now());

        // note that there is a uniqueness constraint on the name column in the DB
        sqlx::query(r#"UPDATE namespace SET deleted_at=$1 WHERE name = $2;"#)
            .bind(flagged_at) // $1
            .bind(name) // $2
            .execute(self.inner.get_mut())
            .await
            .map_err(Error::from)
            .map(|_| ())
    }

    async fn update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result<Namespace> {
        let rec = sqlx::query_as::<_, Namespace>(
            r#"
UPDATE namespace
SET max_tables = $1
WHERE name = $2
RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at,
          partition_template;
        "#,
        )
        .bind(new_max)
        .bind(name)
        .fetch_one(self.inner.get_mut())
        .await;

        let namespace = rec.map_err(|e| match e {
            sqlx::Error::RowNotFound => Error::NotFound {
                descr: name.to_string(),
            },
            _ => Error::External {
                source: Box::new(e),
            },
        })?;

        Ok(namespace)
    }

    async fn update_column_limit(
        &mut self,
        name: &str,
        new_max: MaxColumnsPerTable,
    ) -> Result<Namespace> {
        let rec = sqlx::query_as::<_, Namespace>(
            r#"
UPDATE namespace
SET max_columns_per_table = $1
WHERE name = $2
RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at,
          partition_template;
        "#,
        )
        .bind(new_max)
        .bind(name)
        .fetch_one(self.inner.get_mut())
        .await;

        let namespace = rec.map_err(|e| match e {
            sqlx::Error::RowNotFound => Error::NotFound {
                descr: name.to_string(),
            },
            _ => Error::External {
                source: Box::new(e),
            },
        })?;

        Ok(namespace)
    }

    async fn update_retention_period(
        &mut self,
        name: &str,
        retention_period_ns: Option<i64>,
    ) -> Result<Namespace> {
        let rec = sqlx::query_as::<_, Namespace>(
            r#"
UPDATE namespace
SET retention_period_ns = $1
WHERE name = $2
RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at,
          partition_template;
        "#,
        )
        .bind(retention_period_ns) // $1
        .bind(name) // $2
        .fetch_one(self.inner.get_mut())
        .await;

        let namespace = rec.map_err(|e| match e {
            sqlx::Error::RowNotFound => Error::NotFound {
                descr: name.to_string(),
            },
            _ => Error::External {
                source: Box::new(e),
            },
        })?;

        Ok(namespace)
    }
}

/// [`TableRepo::create`] needs the ability to create some columns within the same transaction as
/// the table creation. Column creation might also happen through [`ColumnRepo::create_or_get`],
/// which doesn't need to be within an outer transaction. This function was extracted so that these
/// two functions can share code but pass in either the transaction or the regular database
/// connection as the query executor.
async fn insert_column_with_connection<'q, E>(
    executor: E,
    name: &str,
    table_id: TableId,
    column_type: ColumnType,
) -> Result<Column>
where
    E: Executor<'q, Database = Sqlite>,
{
    let rec = sqlx::query_as::<_, Column>(
        r#"
INSERT INTO column_name ( name, table_id, column_type )
SELECT $1, table_id, $3 FROM (
    SELECT max_columns_per_table, namespace.id, table_name.id as table_id, COUNT(column_name.id) AS count
    FROM namespace LEFT JOIN table_name ON namespace.id = table_name.namespace_id
                   LEFT JOIN column_name ON table_name.id = column_name.table_id
    WHERE table_name.id = $2
    GROUP BY namespace.max_columns_per_table, namespace.id, table_name.id
) AS get_count WHERE count < max_columns_per_table
ON CONFLICT (table_id, name)
DO UPDATE SET name = column_name.name
RETURNING *;
        "#,
    )
    .bind(name) // $1
    .bind(table_id) // $2
    .bind(column_type) // $3
    .fetch_one(executor)
    .await
    .map_err(|e| match e {
        // The guarded SELECT inserted zero rows: the column limit was reached.
        sqlx::Error::RowNotFound => Error::LimitExceeded {
            descr: format!("couldn't create
column {} in table {}; limit reached on namespace", name, table_id) + }, + _ => { + if is_fk_violation(&e) { + Error::NotFound { descr: e.to_string() } + } else { + Error::External { source: Box::new(e) } + } + }})?; + + ensure!( + rec.column_type == column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + name, rec.column_type, column_type + ), + } + ); + + Ok(rec) +} + +#[async_trait] +impl TableRepo for SqliteTxn { + async fn create( + &mut self, + name: &str, + partition_template: TablePartitionTemplateOverride, + namespace_id: NamespaceId, + ) -> Result
{ + let mut tx = self.inner.get_mut().pool.begin().await?; + + // A simple insert statement becomes quite complicated in order to avoid checking the table + // limits in a select and then conditionally inserting (which would be racey). + // + // from https://www.postgresql.org/docs/current/sql-insert.html + // "INSERT inserts new rows into a table. One can insert one or more rows specified by + // value expressions, or zero or more rows resulting from a query." + // By using SELECT rather than VALUES it will insert zero rows if it finds a null in the + // subquery, i.e. if count >= max_tables. fetch_one() will return a RowNotFound error if + // nothing was inserted. Not pretty! + let table = sqlx::query_as::<_, Table>( + r#" +INSERT INTO table_name ( name, namespace_id, partition_template ) +SELECT $1, id, $2 FROM ( + SELECT namespace.id AS id, max_tables, COUNT(table_name.id) AS count + FROM namespace LEFT JOIN table_name ON namespace.id = table_name.namespace_id + WHERE namespace.id = $3 + GROUP BY namespace.max_tables, table_name.namespace_id, namespace.id +) AS get_count WHERE count < max_tables +RETURNING *; + "#, + ) + .bind(name) // $1 + .bind(partition_template) // $2 + .bind(namespace_id) // $3 + .fetch_one(&mut *tx) + .await + .map_err(|e| match e { + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!( + "couldn't create table {}; limit reached on namespace {}", + name, namespace_id + ), + }, + _ => { + if is_unique_violation(&e) { + Error::AlreadyExists { + descr: format!("table '{name}' in namespace {namespace_id}"), + } + } else if is_fk_violation(&e) { + Error::NotFound { + descr: e.to_string(), + } + } else { + Error::External { + source: Box::new(e), + } + } + } + })?; + + // Partitioning is only supported for tags, so create tag columns for all `TagValue` + // partition template parts. 
It's important this happens within the table creation + // transaction so that there isn't a possibility of a concurrent write creating these + // columns with an unsupported type. + for template_part in table.partition_template.parts() { + if let TemplatePart::TagValue(tag_name) = template_part { + insert_column_with_connection(&mut *tx, tag_name, table.id, ColumnType::Tag) + .await?; + } + } + + tx.commit().await?; + + Ok(table) + } + + async fn get_by_id(&mut self, table_id: TableId) -> Result> { + let rec = sqlx::query_as::<_, Table>( + r#" +SELECT * +FROM table_name +WHERE id = $1; + "#, + ) + .bind(table_id) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let table = rec?; + + Ok(Some(table)) + } + + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result> { + let rec = sqlx::query_as::<_, Table>( + r#" +SELECT * +FROM table_name +WHERE namespace_id = $1 AND name = $2; + "#, + ) + .bind(namespace_id) // $1 + .bind(name) // $2 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let table = rec?; + + Ok(Some(table)) + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let rec = sqlx::query_as::<_, Table>( + r#" +SELECT * +FROM table_name +WHERE namespace_id = $1; + "#, + ) + .bind(namespace_id) + .fetch_all(self.inner.get_mut()) + .await?; + + Ok(rec) + } + + async fn list(&mut self) -> Result> { + let rec = sqlx::query_as::<_, Table>("SELECT * FROM table_name;") + .fetch_all(self.inner.get_mut()) + .await?; + + Ok(rec) + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let mut tx = self.inner.get_mut().pool.begin().await?; + + // This will upgrade the transaction to be exclusive + let rec = sqlx::query( + "UPDATE table_name SET generation = generation + 1 where id = $1 RETURNING *;", + ) + .bind(table_id) // 
$1 + .fetch_one(&mut *tx) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("table: {table_id}"), + }); + } + let row = rec?; + + let generation: i64 = row.get("generation"); + let table = Table::from_row(&row)?; + + let columns = sqlx::query_as::<_, Column>("SELECT * from column_name where table_id = $1;") + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let partitions = + sqlx::query_as::<_, PartitionPod>("SELECT * from partition where table_id = $1;") + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(TableSnapshot::encode( + table, + partitions.into_iter().map(Into::into).collect(), + columns, + generation as _, + )?) + } +} + +#[async_trait] +impl ColumnRepo for SqliteTxn { + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result { + insert_column_with_connection(self.inner.get_mut(), name, table_id, column_type).await + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let rec = sqlx::query_as::<_, Column>( + r#" +SELECT column_name.* FROM table_name +INNER JOIN column_name on column_name.table_id = table_name.id +WHERE table_name.namespace_id = $1; + "#, + ) + .bind(namespace_id) + .fetch_all(self.inner.get_mut()) + .await?; + + Ok(rec) + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let rec = sqlx::query_as::<_, Column>( + r#" +SELECT * FROM column_name +WHERE table_id = $1; + "#, + ) + .bind(table_id) + .fetch_all(self.inner.get_mut()) + .await?; + + Ok(rec) + } + + async fn list(&mut self) -> Result> { + let rec = sqlx::query_as::<_, Column>("SELECT * FROM column_name;") + .fetch_all(self.inner.get_mut()) + .await?; + + Ok(rec) + } + + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result> { + let num_columns = columns.len(); + #[derive(Deserialize, 
Serialize)] + struct NameType<'a> { + name: &'a str, + column_type: i8, + } + impl<'a> NameType<'a> { + fn from(value: (&&'a str, &ColumnType)) -> Self { + Self { + name: value.0, + column_type: *value.1 as i8, + } + } + } + let cols = columns.iter().map(NameType::<'_>::from).collect::>(); + + // The `ORDER BY` in this statement is important to avoid deadlocks during concurrent + // writes to the same IOx table that each add many new columns. See: + // + // - + // - + // - + let out = sqlx::query_as::<_, Column>( + r#" +INSERT INTO column_name ( name, table_id, column_type ) +SELECT a.value ->> 'name' AS name, $1, a.value ->> 'column_type' AS column_type +FROM json_each($2) as a +ORDER BY name +ON CONFLICT (table_id, name) +DO UPDATE SET name = column_name.name +RETURNING *; + "#, + ) + .bind(table_id) // $1 + .bind(&Json(cols)) // $2 + .fetch_all(self.inner.get_mut()) + .await + .map_err(|e| { + if is_fk_violation(&e) { + Error::NotFound { + descr: e.to_string(), + } + } else { + Error::External { + source: Box::new(e), + } + } + })?; + + assert_eq!(num_columns, out.len()); + + for existing in &out { + let want = columns.get(existing.name.as_str()).unwrap(); + ensure!( + existing.column_type == *want, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + existing.name, existing.column_type, want + ), + } + ); + } + + Ok(out) + } +} + +// We can't use [`Partition`], as uses Vec which the Sqlite +// driver cannot serialise + +#[derive(Debug, Clone, PartialEq, Eq, sqlx::FromRow)] +struct PartitionPod { + id: PartitionId, + hash_id: Option, + table_id: TableId, + partition_key: PartitionKey, + sort_key_ids: Json>, + new_file_at: Option, +} + +impl From for Partition { + fn from(value: PartitionPod) -> Self { + let sort_key_ids = SortKeyIds::from(value.sort_key_ids.0); + + Self::new_catalog_only( + value.id, + value.hash_id, + value.table_id, + value.partition_key, + sort_key_ids, + value.new_file_at, + ) + } +} + 
+#[async_trait] +impl PartitionRepo for SqliteTxn { + async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { + // Note: since sort_key is now an array, we must explicitly insert '{}' which is an empty + // array rather than NULL which sqlx will throw `UnexpectedNullError` while is is doing + // `ColumnDecode` + + let hash_id = PartitionHashId::new(table_id, &key); + + let v = sqlx::query_as::<_, PartitionPod>( + r#" +INSERT INTO partition + (partition_key, table_id, hash_id, sort_key_ids) +VALUES + ($1, $2, $3, '[]') +ON CONFLICT (table_id, partition_key) +DO UPDATE SET partition_key = partition.partition_key +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; + "#, + ) + .bind(key) // $1 + .bind(table_id) // $2 + .bind(&hash_id) // $3 + .fetch_one(self.inner.get_mut()) + .await + .map_err(|e| { + if is_fk_violation(&e) { + Error::NotFound { + descr: e.to_string(), + } + } else { + Error::External { + source: Box::new(e), + } + } + })?; + + Ok(v.into()) + } + + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + // We use a JSON-based "IS IN" check. + let ids: Vec<_> = partition_ids.iter().map(|p| p.get()).collect(); + + sqlx::query_as::<_, PartitionPod>( + r#" +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at +FROM partition +WHERE id IN (SELECT value FROM json_each($1)); + "#, + ) + .bind(Json(&ids[..])) // $1 + .fetch_all(self.inner.get_mut()) + .await + .map(|vals| vals.into_iter().map(Partition::from).collect()) + .map_err(Error::from) + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + Ok(sqlx::query_as::<_, PartitionPod>( + r#" +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at +FROM partition +WHERE table_id = $1; + "#, + ) + .bind(table_id) // $1 + .fetch_all(self.inner.get_mut()) + .await? 
+ .into_iter() + .map(Into::into) + .collect()) + } + + async fn list_ids(&mut self) -> Result> { + sqlx::query_as( + r#" + SELECT p.id as partition_id + FROM partition p + "#, + ) + .fetch_all(self.inner.get_mut()) + .await + .map_err(Error::from) + } + + /// Update the sort key for `partition_id` if and only if `old_sort_key` + /// matches the current value in the database. + /// + /// This compare-and-swap operation is allowed to spuriously return + /// [`CasFailure::ValueMismatch`] for performance reasons (avoiding multiple + /// round trips to service a transaction in the happy path). + async fn cas_sort_key( + &mut self, + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let old_sort_key_ids: Vec = old_sort_key_ids.map(Into::into).unwrap_or_default(); + + let raw_new_sort_key_ids: Vec = new_sort_key_ids.into(); + + // This `match` will go away when all partitions have hash IDs in the database. + let query = sqlx::query_as::<_, PartitionPod>( + r#" +UPDATE partition +SET sort_key_ids = $1 +WHERE id = $2 AND sort_key_ids = $3 +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; + "#, + ) + .bind(Json(raw_new_sort_key_ids)) // $1 + .bind(partition_id) // $2 + .bind(Json(old_sort_key_ids)); // $3 + + let res = query.fetch_one(self.inner.get_mut()).await; + + let partition = match res { + Ok(v) => v, + Err(sqlx::Error::RowNotFound) => { + // This update may have failed either because: + // + // * A row with the specified ID did not exist at query time + // (but may exist now!) + // * The sort key does not match. + // + // To differentiate, we submit a get partition query, returning + // the actual sort key if successful. + // + // NOTE: this is racy, but documented - this might return "Sort + // key differs! Old key: " + + let partition = (self as &mut dyn PartitionRepo) + .get_by_id(partition_id) + .await + .map_err(CasFailure::QueryError)? 
+ .ok_or(CasFailure::QueryError(Error::NotFound { + descr: partition_id.to_string(), + }))?; + return Err(CasFailure::ValueMismatch( + partition.sort_key_ids().cloned().unwrap_or_default(), + )); + } + Err(e) => { + return Err(CasFailure::QueryError(Error::External { + source: Box::new(e), + })) + } + }; + + debug!(?partition_id, "partition sort key cas successful"); + + Ok(partition.into()) + } + + async fn record_skipped_compaction( + &mut self, + partition_id: PartitionId, + reason: &str, + num_files: usize, + limit_num_files: usize, + limit_num_files_first_in_partition: usize, + estimated_bytes: u64, + limit_bytes: u64, + ) -> Result<()> { + sqlx::query( + r#" +INSERT INTO skipped_compactions + ( partition_id, reason, num_files, limit_num_files, limit_num_files_first_in_partition, estimated_bytes, limit_bytes, skipped_at ) +VALUES + ( $1, $2, $3, $4, $5, $6, $7, $8 ) +ON CONFLICT ( partition_id ) +DO UPDATE +SET +reason = EXCLUDED.reason, +num_files = EXCLUDED.num_files, +limit_num_files = EXCLUDED.limit_num_files, +limit_num_files_first_in_partition = EXCLUDED.limit_num_files_first_in_partition, +estimated_bytes = EXCLUDED.estimated_bytes, +limit_bytes = EXCLUDED.limit_bytes, +skipped_at = EXCLUDED.skipped_at; + "#, + ) + .bind(partition_id) // $1 + .bind(reason) + .bind(num_files as i64) + .bind(limit_num_files as i64) + .bind(limit_num_files_first_in_partition as i64) + .bind(estimated_bytes as i64) + .bind(limit_bytes as i64) + .bind(std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs() as i64) + .execute(self.inner.get_mut()) + .await?; + Ok(()) + } + + async fn get_in_skipped_compactions( + &mut self, + partition_ids: &[PartitionId], + ) -> Result> { + let ids = partition_ids.iter().map(|p| p.get()).collect::>(); + let rec = sqlx::query_as::( + r#"SELECT * FROM skipped_compactions WHERE partition_id IN (SELECT value FROM json_each($1));"#, + ) + .bind(Json(&ids[..])) + .fetch_all(self.inner.get_mut()) + .await; + + let 
skipped_partition_records = rec?; + + Ok(skipped_partition_records) + } + + async fn list_skipped_compactions(&mut self) -> Result> { + sqlx::query_as::<_, SkippedCompaction>( + r#" +SELECT * FROM skipped_compactions + "#, + ) + .fetch_all(self.inner.get_mut()) + .await + .map_err(Error::from) + } + + async fn delete_skipped_compactions( + &mut self, + partition_id: PartitionId, + ) -> Result> { + sqlx::query_as::<_, SkippedCompaction>( + r#" +DELETE FROM skipped_compactions +WHERE partition_id = $1 +RETURNING * + "#, + ) + .bind(partition_id) + .fetch_optional(self.inner.get_mut()) + .await + .map_err(Error::from) + } + + async fn most_recent_n(&mut self, n: usize) -> Result> { + Ok(sqlx::query_as::<_, PartitionPod>( + r#" +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at +FROM partition +ORDER BY id DESC +LIMIT $1; + "#, + ) + .bind(n as i64) // $1 + .fetch_all(self.inner.get_mut()) + .await? + .into_iter() + .map(Into::into) + .collect()) + } + + async fn partitions_new_file_between( + &mut self, + minimum_time: Timestamp, + maximum_time: Option, + ) -> Result> { + let sql = format!( + r#" + SELECT p.id as partition_id + FROM partition p + WHERE p.new_file_at > $1 + {} + "#, + maximum_time + .map(|_| "AND p.new_file_at < $2") + .unwrap_or_default() + ); + + sqlx::query_as(&sql) + .bind(minimum_time) // $1 + .bind(maximum_time) // $2 + .fetch_all(self.inner.get_mut()) + .await + .map_err(Error::from) + } + + async fn list_old_style(&mut self) -> Result> { + Ok(sqlx::query_as::<_, PartitionPod>( + r#" +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at +FROM partition +WHERE hash_id IS NULL +ORDER BY id DESC; + "#, + ) + .fetch_all(self.inner.get_mut()) + .await? 
+ .into_iter() + .map(Into::into) + .collect()) + } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let mut tx = self.inner.get_mut().pool.begin().await?; + + // This will upgrade the transaction to be exclusive + let rec = sqlx::query( + "UPDATE partition SET generation = generation + 1 where id = $1 RETURNING *;", + ) + .bind(partition_id) // $1 + .fetch_one(&mut *tx) + .await; + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("partition: {partition_id}"), + }); + } + let row = rec?; + + let generation: i64 = row.get("generation"); + let partition = PartitionPod::from_row(&row)?; + + let (namespace_id,): (NamespaceId,) = + sqlx::query_as("SELECT namespace_id from table_name where id = $1") + .bind(partition.table_id) // $1 + .fetch_one(&mut *tx) + .await?; + + let files = + sqlx::query_as::<_, ParquetFilePod>("SELECT * from parquet_file where partition_id = $1 AND parquet_file.to_delete IS NULL;") + .bind(partition_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let sc = sqlx::query_as::( + r#"SELECT * FROM skipped_compactions WHERE partition_id = $1;"#, + ) + .bind(partition_id) + .fetch_optional(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(PartitionSnapshot::encode( + namespace_id, + partition.into(), + files.into_iter().map(Into::into).collect(), + sc, + generation as _, + )?) 
+ } +} + +fn from_column_set(v: &ColumnSet) -> Json> { + Json((*v).iter().map(ColumnId::get).collect()) +} + +fn to_column_set(v: &Json>) -> ColumnSet { + ColumnSet::new(v.0.iter().map(|v| ColumnId::new(*v))) +} + +#[derive(Debug, Clone, PartialEq, Eq, sqlx::FromRow)] +struct ParquetFilePod { + id: ParquetFileId, + namespace_id: NamespaceId, + table_id: TableId, + partition_id: PartitionId, + partition_hash_id: Option, + object_store_id: ObjectStoreId, + min_time: Timestamp, + max_time: Timestamp, + to_delete: Option, + file_size_bytes: i64, + row_count: i64, + compaction_level: CompactionLevel, + created_at: Timestamp, + column_set: Json>, + max_l0_created_at: Timestamp, +} + +impl From for ParquetFile { + fn from(value: ParquetFilePod) -> Self { + Self { + id: value.id, + namespace_id: value.namespace_id, + table_id: value.table_id, + partition_id: value.partition_id, + partition_hash_id: value.partition_hash_id, + object_store_id: value.object_store_id, + min_time: value.min_time, + max_time: value.max_time, + to_delete: value.to_delete, + file_size_bytes: value.file_size_bytes, + row_count: value.row_count, + compaction_level: value.compaction_level, + created_at: value.created_at, + column_set: to_column_set(&value.column_set), + max_l0_created_at: value.max_l0_created_at, + } + } +} + +#[async_trait] +impl ParquetFileRepo for SqliteTxn { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let flagged_at = Timestamp::from(self.time_provider.now()); + // TODO - include check of table retention period once implemented + let flagged = sqlx::query( + r#" +WITH parquet_file_ids as ( + SELECT parquet_file.object_store_id + FROM namespace, parquet_file + WHERE namespace.retention_period_ns IS NOT NULL + AND parquet_file.to_delete IS NULL + AND parquet_file.max_time < $1 - namespace.retention_period_ns + AND namespace.id = parquet_file.namespace_id + LIMIT $2 +) +UPDATE parquet_file +SET to_delete = $1 +WHERE object_store_id IN (SELECT object_store_id 
FROM parquet_file_ids) +RETURNING partition_id, object_store_id; + "#, + ) + .bind(flagged_at) // $1 + .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION) // $2 + .fetch_all(self.inner.get_mut()) + .await?; + + let flagged = flagged + .into_iter() + .map(|row| (row.get("partition_id"), row.get("object_store_id"))) + .collect(); + Ok(flagged) + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + // see https://www.crunchydata.com/blog/simulating-update-or-delete-with-limit-in-sqlite-ctes-to-the-rescue + let deleted = sqlx::query( + r#" +WITH parquet_file_ids as ( + SELECT object_store_id + FROM parquet_file + WHERE to_delete < $1 + LIMIT $2 +) +DELETE FROM parquet_file +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING object_store_id; + "#, + ) + .bind(older_than) // $1 + .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE) // $2 + .fetch_all(self.inner.get_mut()) + .await?; + + let deleted = deleted + .into_iter() + .map(|row| row.get("object_store_id")) + .collect(); + Ok(deleted) + } + + async fn list_by_partition_not_to_delete_batch( + &mut self, + partition_ids: Vec, + ) -> Result> { + // We use a JSON-based "IS IN" check. + let ids: Vec<_> = partition_ids.iter().map(|p| p.get()).collect(); + + let query = sqlx::query_as::<_, ParquetFilePod>( + r#" +SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, + object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, + compaction_level, created_at, column_set, max_l0_created_at +FROM parquet_file +WHERE parquet_file.partition_id IN (SELECT value FROM json_each($1)) + AND parquet_file.to_delete IS NULL; + "#, + ) + .bind(Json(&ids[..])); // $1 + + Ok(query + .fetch_all(self.inner.get_mut()) + .await? 
+ .into_iter() + .map(Into::into) + .collect()) + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result> { + let rec = sqlx::query_as::<_, ParquetFilePod>( + r#" +SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, min_time, + max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set, + max_l0_created_at +FROM parquet_file +WHERE object_store_id = $1; + "#, + ) + .bind(object_store_id) // $1 + .fetch_one(self.inner.get_mut()) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Ok(None); + } + + let parquet_file = rec?; + + Ok(Some(parquet_file.into())) + } + + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result> { + let in_value = object_store_ids + .into_iter() + // use a sqlite blob literal + .map(|id| format!("X'{}'", id.get_uuid().simple())) + .collect::>() + .join(","); + + sqlx::query(&format!( + " +SELECT object_store_id +FROM parquet_file +WHERE object_store_id IN ({v});", + v = in_value + )) + .map(|slr: SqliteRow| slr.get::("object_store_id")) + // limitation of sqlx: will not bind arrays + // https://github.com/launchbadge/sqlx/blob/main/FAQ.md#how-can-i-do-a-select--where-foo-in--query + .fetch_all(self.inner.get_mut()) + .await + .map_err(Error::from) + } + + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result> { + let delete_set = delete.iter().copied().collect::>(); + let upgrade_set = upgrade.iter().copied().collect::>(); + + assert!( + delete_set.is_disjoint(&upgrade_set), + "attempted to upgrade a file scheduled for delete" + ); + let mut tx = self.inner.get_mut().pool.begin().await?; + + for id in delete { + let marked_at = Timestamp::from(self.time_provider.now()); + flag_for_delete(&mut *tx, partition_id, *id, 
marked_at).await?; + } + + update_compaction_level(&mut *tx, partition_id, upgrade, target_level).await?; + + let mut ids = Vec::with_capacity(create.len()); + for file in create { + if file.partition_id != partition_id { + return Err(Error::External { + source: format!("Inconsistent ParquetFileParams, expected PartitionId({partition_id}) got PartitionId({})", file.partition_id).into(), + }); + } + let res = create_parquet_file(&mut *tx, file.clone()).await?; + ids.push(res.id); + } + tx.commit().await?; + + Ok(ids) + } +} + +// The following three functions are helpers to the create_upgrade_delete method. +// They are also used by the respective create/flag_for_delete/update_compaction_level methods. +async fn create_parquet_file<'q, E>( + executor: E, + parquet_file_params: ParquetFileParams, +) -> Result +where + E: Executor<'q, Database = Sqlite>, +{ + let ParquetFileParams { + namespace_id, + table_id, + partition_id, + partition_hash_id, + object_store_id, + min_time, + max_time, + file_size_bytes, + row_count, + compaction_level, + created_at, + column_set, + max_l0_created_at, + } = parquet_file_params; + + let res = sqlx::query_as::<_, ParquetFilePod>( + r#" +INSERT INTO parquet_file ( + table_id, partition_id, partition_hash_id, object_store_id, + min_time, max_time, file_size_bytes, + row_count, compaction_level, created_at, namespace_id, column_set, max_l0_created_at ) +VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 ) +RETURNING + id, table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, to_delete, + file_size_bytes, row_count, compaction_level, created_at, namespace_id, column_set, + max_l0_created_at; + "#, + ) + .bind(table_id) // $1 + .bind(partition_id) // $2 + .bind(partition_hash_id.as_ref()) // $3 + .bind(object_store_id) // $4 + .bind(min_time) // $5 + .bind(max_time) // $6 + .bind(file_size_bytes) // $7 + .bind(row_count) // $8 + .bind(compaction_level) // $9 + .bind(created_at) // $10 + 
.bind(namespace_id) // $11 + .bind(from_column_set(&column_set)) // $12 + .bind(max_l0_created_at) // $13 + .fetch_one(executor) + .await; + + let rec = res.map_err(|e| { + if is_unique_violation(&e) { + Error::AlreadyExists { + descr: object_store_id.to_string(), + } + } else if is_fk_violation(&e) { + Error::NotFound { + descr: e.to_string(), + } + } else { + Error::External { + source: Box::new(e), + } + } + })?; + + Ok(rec.into()) +} + +async fn flag_for_delete<'q, E>( + executor: E, + partition_id: PartitionId, + id: ObjectStoreId, + marked_at: Timestamp, +) -> Result<()> +where + E: Executor<'q, Database = Sqlite>, +{ + let updated = + sqlx::query_as::<_, (i64,)>(r#"UPDATE parquet_file SET to_delete = $1 WHERE object_store_id = $2 AND partition_id = $3 AND to_delete is NULL returning id;"#) + .bind(marked_at) // $1 + .bind(id) // $2 + .bind(partition_id) // $3 + .fetch_all(executor) + .await?; + + if updated.len() != 1 { + return Err(Error::NotFound { + descr: format!("parquet file {id} not found for delete"), + }); + } + + Ok(()) +} + +async fn update_compaction_level<'q, E>( + executor: E, + partition_id: PartitionId, + object_store_ids: &[ObjectStoreId], + compaction_level: CompactionLevel, +) -> Result<()> +where + E: Executor<'q, Database = Sqlite>, +{ + let in_value = object_store_ids + .iter() + // use a sqlite blob literal + .map(|id| format!("X'{}'", id.get_uuid().simple())) + .collect::>() + .join(","); + + let updated = sqlx::query_as::<_, (i64,)>(&format!( + r#" +UPDATE parquet_file +SET compaction_level = $1 +WHERE object_store_id IN ({v}) AND partition_id = $2 AND to_delete is NULL returning id; + "#, + v = in_value, + )) + .bind(compaction_level) // $1 + .bind(partition_id) // $2 + .fetch_all(executor) + .await?; + + if updated.len() != object_store_ids.len() { + return Err(Error::NotFound { + descr: "parquet file(s) not found for upgrade".to_string(), + }); + } + + Ok(()) +} + +/// The error code returned by SQLite for a unique constraint 
violation. +/// +/// See +const SQLITE_UNIQUE_VIOLATION: &str = "2067"; + +/// Error code returned by SQLite for a foreign key constraint violation. +/// See +const SQLITE_FK_VIOLATION: &str = "787"; + +fn is_fk_violation(e: &sqlx::Error) -> bool { + if let sqlx::Error::Database(inner) = e { + if let Some(code) = inner.code() { + if code == SQLITE_FK_VIOLATION { + return true; + } + } + } + + false +} + +/// Returns true if `e` is a unique constraint violation error. +fn is_unique_violation(e: &sqlx::Error) -> bool { + if let sqlx::Error::Database(inner) = e { + if let Some(code) = inner.code() { + if code == SQLITE_UNIQUE_VIOLATION { + return true; + } + } + } + + false +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::interface::ParquetFileRepoExt; + use crate::test_helpers::{ + arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table, + }; + use assert_matches::assert_matches; + use data_types::partition_template::TemplatePart; + use generated_types::influxdata::iox::partition_template::v1 as proto; + use std::sync::Arc; + + async fn setup_db() -> SqliteCatalog { + let dsn = + std::env::var("TEST_INFLUXDB_SQLITE_DSN").unwrap_or("sqlite::memory:".to_string()); + let options = SqliteConnectionOptions { file_path: dsn }; + let metrics = Arc::new(Registry::default()); + let cat = SqliteCatalog::connect(options, metrics) + .await + .expect("failed to connect to catalog"); + cat.setup().await.expect("failed to initialise database"); + cat + } + + #[tokio::test] + async fn test_catalog() { + crate::interface_tests::test_catalog(|| async { + let sqlite = setup_db().await; + let sqlite: Arc = Arc::new(sqlite); + sqlite + }) + .await; + } + + #[tokio::test] + async fn existing_partitions_without_hash_id() { + let sqlite: SqliteCatalog = setup_db().await; + let pool = sqlite.pool.clone(); + let sqlite: Arc = Arc::new(sqlite); + let mut repos = sqlite.repositories(); + + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table = 
arbitrary_table(&mut *repos, "table", &namespace).await; + let table_id = table.id; + let key = PartitionKey::from("francis-scott-key-key"); + + // Create a partition record in the database that has `NULL` for its `hash_id` + // value, which is what records existing before the migration adding that column will have. + sqlx::query( + r#" +INSERT INTO partition + (partition_key, table_id, sort_key_ids) +VALUES + ($1, $2, '[]') +ON CONFLICT (table_id, partition_key) +DO UPDATE SET partition_key = partition.partition_key +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; + "#, + ) + .bind(&key) // $1 + .bind(table_id) // $2 + .fetch_one(&pool) + .await + .unwrap(); + + // Check that the hash_id being null in the database doesn't break querying for partitions. + let table_partitions = repos.partitions().list_by_table_id(table_id).await.unwrap(); + assert_eq!(table_partitions.len(), 1); + let partition = &table_partitions[0]; + + // Call create_or_get for the same (key, table_id) pair, to ensure the write is idempotent + // and that the hash_id still doesn't get set. 
+ let inserted_again = repos + .partitions() + .create_or_get(key, table_id) + .await + .expect("idempotent write should succeed"); + + // Test: sort_key_ids from freshly insert with empty value + assert!(inserted_again.sort_key_ids().is_none()); + + assert_eq!(partition, &inserted_again); + + // Create a Parquet file record in this partition to ensure we don't break new data + // ingestion for old-style partitions + let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, partition); + let parquet_file = repos + .parquet_files() + .create(parquet_file_params) + .await + .unwrap(); + assert_eq!(parquet_file.partition_hash_id, None); + + // Add a partition record WITH a hash ID + repos + .partitions() + .create_or_get(PartitionKey::from("Something else"), table_id) + .await + .unwrap(); + + // Ensure we can list only the old-style partitions + let old_style_partitions = repos.partitions().list_old_style().await.unwrap(); + assert_eq!(old_style_partitions.len(), 1); + assert_eq!(old_style_partitions[0].id, partition.id); + } + + #[tokio::test] + async fn test_billing_summary_on_parqet_file_creation() { + let sqlite = setup_db().await; + let pool = sqlite.pool.clone(); + let sqlite: Arc = Arc::new(sqlite); + let mut repos = sqlite.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table = arbitrary_table(&mut *repos, "table", &namespace).await; + let key = "bananas"; + let partition = repos + .partitions() + .create_or_get(key.into(), table.id) + .await + .unwrap(); + + // parquet file to create- all we care about here is the size + let mut p1 = arbitrary_parquet_file_params(&namespace, &table, &partition); + p1.file_size_bytes = 1337; + let f1 = repos + .parquet_files() + .create(p1.clone()) + .await + .expect("create parquet file should succeed"); + // insert the same again with a different size; we should then have 3x1337 as total file + // size + p1.object_store_id = ObjectStoreId::new(); + p1.file_size_bytes 
*= 2; + let _f2 = repos + .parquet_files() + .create(p1.clone()) + .await + .expect("create parquet file should succeed"); + + // after adding two files we should have 3x1337 in the summary + let total_file_size_bytes: i64 = + sqlx::query_scalar("SELECT total_file_size_bytes FROM billing_summary;") + .fetch_one(&pool) + .await + .expect("fetch total file size failed"); + assert_eq!(total_file_size_bytes, 1337 * 3); + + // flag f1 for deletion and assert that the total file size is reduced accordingly. + repos + .parquet_files() + .create_upgrade_delete( + partition.id, + &[f1.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .expect("flag parquet file for deletion should succeed"); + let total_file_size_bytes: i64 = + sqlx::query_scalar("SELECT total_file_size_bytes FROM billing_summary;") + .fetch_one(&pool) + .await + .expect("fetch total file size failed"); + // we marked the first file of size 1337 for deletion leaving only the second that was 2x that + assert_eq!(total_file_size_bytes, 1337 * 2); + + // actually deleting shouldn't change the total + let older_than = p1.created_at + 1; + repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .expect("parquet file deletion should succeed"); + let total_file_size_bytes: i64 = + sqlx::query_scalar("SELECT total_file_size_bytes FROM billing_summary;") + .fetch_one(&pool) + .await + .expect("fetch total file size failed"); + assert_eq!(total_file_size_bytes, 1337 * 2); + } + + #[tokio::test] + async fn namespace_partition_template_null_is_the_default_in_the_database() { + let sqlite = setup_db().await; + let pool = sqlite.pool.clone(); + let sqlite: Arc = Arc::new(sqlite); + let mut repos = sqlite.repositories(); + + let namespace_name = "apples"; + + // Create a namespace record in the database that has `NULL` for its `partition_template` + // value, which is what records existing before the migration adding that column will have. 
+ let insert_null_partition_template_namespace = sqlx::query( + r#" +INSERT INTO namespace ( + name, retention_period_ns, partition_template +) +VALUES ( $1, $2, NULL ) +RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, + partition_template; + "#, + ) + .bind(namespace_name) // $1 + .bind(None::>); // $2 + + insert_null_partition_template_namespace + .fetch_one(&pool) + .await + .unwrap(); + + let lookup_namespace = repos + .namespaces() + .get_by_name(namespace_name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .unwrap(); + // When fetching this namespace from the database, the `FromRow` impl should set its + // `partition_template` to the default. + assert_eq!( + lookup_namespace.partition_template, + NamespacePartitionTemplateOverride::default() + ); + + // When creating a namespace through the catalog functions without specifying a custom + // partition template, + let created_without_custom_template = repos + .namespaces() + .create( + &"lemons".try_into().unwrap(), + None, // no partition template + None, + None, + ) + .await + .unwrap(); + + // it should have the default template in the application, + assert_eq!( + created_without_custom_template.partition_template, + NamespacePartitionTemplateOverride::default() + ); + + // and store NULL in the database record. + let record = sqlx::query("SELECT name, partition_template FROM namespace WHERE id = $1;") + .bind(created_without_custom_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(created_without_custom_template.name, name); + let partition_template: Option = + record.try_get("partition_template").unwrap(); + assert!(partition_template.is_none()); + + // When explicitly setting a template that happens to be equal to the application default, + // assume it's important that it's being specially requested and store it rather than NULL. 
+ let namespace_custom_template_name = "kumquats"; + let custom_partition_template_equal_to_default = + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat( + "%Y-%m-%d".to_owned(), + )), + }], + }) + .unwrap(); + let namespace_custom_template = repos + .namespaces() + .create( + &namespace_custom_template_name.try_into().unwrap(), + Some(custom_partition_template_equal_to_default.clone()), + None, + None, + ) + .await + .unwrap(); + assert_eq!( + namespace_custom_template.partition_template, + custom_partition_template_equal_to_default + ); + let record = sqlx::query("SELECT name, partition_template FROM namespace WHERE id = $1;") + .bind(namespace_custom_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(namespace_custom_template.name, name); + let partition_template: Option = + record.try_get("partition_template").unwrap(); + assert_eq!( + partition_template.unwrap(), + custom_partition_template_equal_to_default + ); + } + + #[tokio::test] + async fn table_partition_template_null_is_the_default_in_the_database() { + let sqlite = setup_db().await; + let pool = sqlite.pool.clone(); + let sqlite: Arc = Arc::new(sqlite); + let mut repos = sqlite.repositories(); + + let namespace_default_template_name = "oranges"; + let namespace_default_template = repos + .namespaces() + .create( + &namespace_default_template_name.try_into().unwrap(), + None, // no partition template + None, + None, + ) + .await + .unwrap(); + + let namespace_custom_template_name = "limes"; + let namespace_custom_template = repos + .namespaces() + .create( + &namespace_custom_template_name.try_into().unwrap(), + Some( + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }) + .unwrap(), + 
), + None, + None, + ) + .await + .unwrap(); + + // In a namespace that also has a NULL template, create a table record in the database that + // has `NULL` for its `partition_template` value, which is what records existing before the + // migration adding that column will have. + let table_name = "null_template"; + let insert_null_partition_template_table = sqlx::query( + r#" +INSERT INTO table_name ( name, namespace_id, partition_template ) +VALUES ( $1, $2, NULL ) +RETURNING *; + "#, + ) + .bind(table_name) // $1 + .bind(namespace_default_template.id); // $2 + + insert_null_partition_template_table + .fetch_one(&pool) + .await + .unwrap(); + + let lookup_table = repos + .tables() + .get_by_namespace_and_name(namespace_default_template.id, table_name) + .await + .unwrap() + .unwrap(); + // When fetching this table from the database, the `FromRow` impl should set its + // `partition_template` to the system default (because the namespace didn't have a template + // either). + assert_eq!( + lookup_table.partition_template, + TablePartitionTemplateOverride::default() + ); + + // In a namespace that has a custom template, create a table record in the database that + // has `NULL` for its `partition_template` value. + // + // THIS ACTUALLY SHOULD BE IMPOSSIBLE because: + // + // * Namespaces have to exist before tables + // * `partition_tables` are immutable on both namespaces and tables + // * When the migration adding the `partition_table` column is deployed, namespaces can + // begin to be created with `partition_templates` + // * *Then* tables can be created with `partition_templates` or not + // * When tables don't get a custom table partition template but their namespace has one, + // their database record will get the namespace partition template. + // + // In other words, table `partition_template` values in the database is allowed to possibly + // be `NULL` IFF their namespace's `partition_template` is `NULL`. 
+ // + // That said, this test creates this hopefully-impossible scenario to ensure that the + // defined, expected behavior if a table record somehow exists in the database with a `NULL` + // `partition_template` value is that it will have the application default partition + // template *even if the namespace `partition_template` is not null*. + let table_name = "null_template"; + let insert_null_partition_template_table = sqlx::query( + r#" +INSERT INTO table_name ( name, namespace_id, partition_template ) +VALUES ( $1, $2, NULL ) +RETURNING *; + "#, + ) + .bind(table_name) // $1 + .bind(namespace_custom_template.id); // $2 + + insert_null_partition_template_table + .fetch_one(&pool) + .await + .unwrap(); + + let lookup_table = repos + .tables() + .get_by_namespace_and_name(namespace_custom_template.id, table_name) + .await + .unwrap() + .unwrap(); + // When fetching this table from the database, the `FromRow` impl should set its + // `partition_template` to the system default *even though the namespace has a + // template*, because this should be impossible as detailed above. + assert_eq!( + lookup_table.partition_template, + TablePartitionTemplateOverride::default() + ); + + // # Table template false, namespace template true + // + // When creating a table through the catalog functions *without* a custom table template in + // a namespace *with* a custom partition template, + let table_no_template_with_namespace_template = repos + .tables() + .create( + "pomelo", + TablePartitionTemplateOverride::try_new( + None, // no custom partition template + &namespace_custom_template.partition_template, + ) + .unwrap(), + namespace_custom_template.id, + ) + .await + .unwrap(); + + // it should have the namespace's template + assert_eq!( + table_no_template_with_namespace_template.partition_template, + TablePartitionTemplateOverride::try_new( + None, + &namespace_custom_template.partition_template + ) + .unwrap() + ); + + // and store that value in the database record. 
+ let record = sqlx::query("SELECT name, partition_template FROM table_name WHERE id = $1;") + .bind(table_no_template_with_namespace_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(table_no_template_with_namespace_template.name, name); + let partition_template: Option = + record.try_get("partition_template").unwrap(); + assert_eq!( + partition_template.unwrap(), + TablePartitionTemplateOverride::try_new( + None, + &namespace_custom_template.partition_template + ) + .unwrap() + ); + + // # Table template true, namespace template false + // + // When creating a table through the catalog functions *with* a custom table template in + // a namespace *without* a custom partition template, + let custom_table_template = proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("chemical".into())), + }], + }; + let table_with_template_no_namespace_template = repos + .tables() + .create( + "tangerine", + TablePartitionTemplateOverride::try_new( + Some(custom_table_template), // with custom partition template + &namespace_default_template.partition_template, + ) + .unwrap(), + namespace_default_template.id, + ) + .await + .unwrap(); + + // it should have the custom table template + let table_template_parts: Vec<_> = table_with_template_no_namespace_template + .partition_template + .parts() + .collect(); + assert_eq!(table_template_parts.len(), 1); + assert_matches!( + table_template_parts[0], + TemplatePart::TagValue(tag) if tag == "chemical" + ); + + // and store that value in the database record. 
+ let record = sqlx::query("SELECT name, partition_template FROM table_name WHERE id = $1;") + .bind(table_with_template_no_namespace_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(table_with_template_no_namespace_template.name, name); + let partition_template = record + .try_get::, _>("partition_template") + .unwrap() + .unwrap(); + let table_template_parts: Vec<_> = partition_template.parts().collect(); + assert_eq!(table_template_parts.len(), 1); + assert_matches!( + table_template_parts[0], + TemplatePart::TagValue(tag) if tag == "chemical" + ); + + // # Table template true, namespace template true + // + // When creating a table through the catalog functions *with* a custom table template in + // a namespace *with* a custom partition template, + let custom_table_template = proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("vegetable".into())), + }], + }; + let table_with_template_with_namespace_template = repos + .tables() + .create( + "nectarine", + TablePartitionTemplateOverride::try_new( + Some(custom_table_template), // with custom partition template + &namespace_custom_template.partition_template, + ) + .unwrap(), + namespace_custom_template.id, + ) + .await + .unwrap(); + + // it should have the custom table template + let table_template_parts: Vec<_> = table_with_template_with_namespace_template + .partition_template + .parts() + .collect(); + assert_eq!(table_template_parts.len(), 1); + assert_matches!( + table_template_parts[0], + TemplatePart::TagValue(tag) if tag == "vegetable" + ); + + // and store that value in the database record. 
+ let record = sqlx::query("SELECT name, partition_template FROM table_name WHERE id = $1;") + .bind(table_with_template_with_namespace_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(table_with_template_with_namespace_template.name, name); + let partition_template = record + .try_get::, _>("partition_template") + .unwrap() + .unwrap(); + let table_template_parts: Vec<_> = partition_template.parts().collect(); + assert_eq!(table_template_parts.len(), 1); + assert_matches!( + table_template_parts[0], + TemplatePart::TagValue(tag) if tag == "vegetable" + ); + + // # Table template false, namespace template false + // + // When creating a table through the catalog functions *without* a custom table template in + // a namespace *without* a custom partition template, + let table_no_template_no_namespace_template = repos + .tables() + .create( + "grapefruit", + TablePartitionTemplateOverride::try_new( + None, // no custom partition template + &namespace_default_template.partition_template, + ) + .unwrap(), + namespace_default_template.id, + ) + .await + .unwrap(); + + // it should have the default template in the application, + assert_eq!( + table_no_template_no_namespace_template.partition_template, + TablePartitionTemplateOverride::default() + ); + + // and store NULL in the database record. 
+ let record = sqlx::query("SELECT name, partition_template FROM table_name WHERE id = $1;") + .bind(table_no_template_no_namespace_template.id) + .fetch_one(&pool) + .await + .unwrap(); + let name: String = record.try_get("name").unwrap(); + assert_eq!(table_no_template_no_namespace_template.name, name); + let partition_template: Option = + record.try_get("partition_template").unwrap(); + assert!(partition_template.is_none()); + } +} diff --git a/iox_catalog/src/test_helpers.rs b/iox_catalog/src/test_helpers.rs new file mode 100644 index 0000000..0861d79 --- /dev/null +++ b/iox_catalog/src/test_helpers.rs @@ -0,0 +1,92 @@ +//! Catalog helper functions for creation of catalog objects +use data_types::{ + partition_template::TablePartitionTemplateOverride, ColumnId, ColumnSet, CompactionLevel, + Namespace, NamespaceName, ObjectStoreId, ParquetFileParams, Partition, Table, TableSchema, + Timestamp, +}; + +use crate::interface::RepoCollection; + +/// When the details of the namespace don't matter; the test just needs *a* catalog namespace +/// with a particular name. +/// +/// Use [`NamespaceRepo::create`] directly if: +/// +/// - The values of the parameters to `create` need to be different than what's here +/// - The values of the parameters to `create` are relevant to the behavior under test +/// - You expect namespace creation to fail in the test +/// +/// [`NamespaceRepo::create`]: crate::interface::NamespaceRepo::create +pub async fn arbitrary_namespace( + repos: &mut R, + name: &str, +) -> Namespace { + let namespace_name = NamespaceName::new(name).unwrap(); + repos + .namespaces() + .create(&namespace_name, None, None, None) + .await + .unwrap() +} + +/// When the details of the table don't matter; the test just needs *a* catalog table +/// with a particular name in a particular namespace. 
+/// +/// Use [`TableRepo::create`] directly if: +/// +/// - The values of the parameters to `create_or_get` need to be different than what's here +/// - The values of the parameters to `create_or_get` are relevant to the behavior under test +/// - You expect table creation to fail in the test +/// +/// [`TableRepo::create`]: crate::interface::TableRepo::create +pub async fn arbitrary_table( + repos: &mut R, + name: &str, + namespace: &Namespace, +) -> Table { + repos + .tables() + .create( + name, + TablePartitionTemplateOverride::try_new(None, &namespace.partition_template).unwrap(), + namespace.id, + ) + .await + .unwrap() +} + +/// Load or create an arbitrary table schema in the same way that a write implicitly creates a +/// table, that is, with a time column. +pub async fn arbitrary_table_schema_load_or_create( + repos: &mut R, + name: &str, + namespace: &Namespace, +) -> TableSchema { + crate::util::table_load_or_create(repos, namespace.id, &namespace.partition_template, name) + .await + .unwrap() +} + +/// When the details of a Parquet file record don't matter, the test just needs *a* Parquet +/// file record in a particular namespace+table+partition. +pub fn arbitrary_parquet_file_params( + namespace: &Namespace, + table: &Table, + partition: &Partition, +) -> ParquetFileParams { + ParquetFileParams { + namespace_id: namespace.id, + table_id: table.id, + partition_id: partition.id, + partition_hash_id: partition.hash_id().cloned(), + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(1), + max_time: Timestamp::new(10), + file_size_bytes: 1337, + row_count: 0, + compaction_level: CompactionLevel::Initial, + created_at: Timestamp::new(1), + column_set: ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]), + max_l0_created_at: Timestamp::new(1), + } +} diff --git a/iox_catalog/src/util.rs b/iox_catalog/src/util.rs new file mode 100644 index 0000000..d6d184f --- /dev/null +++ b/iox_catalog/src/util.rs @@ -0,0 +1,897 @@ +//! 
Helper methods to simplify catalog work. +//! +//! They all use the public [`Catalog`] interface and have no special access to internals, so in theory they can be +//! implement downstream as well. + +use std::{ + borrow::Cow, + collections::{BTreeMap, HashMap, HashSet}, + sync::Arc, +}; + +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + ColumnType, ColumnsByName, Namespace, NamespaceId, NamespaceSchema, PartitionId, SortKeyIds, + TableId, TableSchema, +}; +use mutable_batch::MutableBatch; +use thiserror::Error; + +use crate::{ + constants::TIME_COLUMN, + interface::{CasFailure, Catalog, Error, RepoCollection, SoftDeletedRows}, +}; + +/// Gets the namespace schema including all tables and columns. +pub async fn get_schema_by_id( + id: NamespaceId, + repos: &mut R, + deleted: SoftDeletedRows, +) -> Result, crate::interface::Error> +where + R: RepoCollection + ?Sized, +{ + let Some(namespace) = repos.namespaces().get_by_id(id, deleted).await? else { + return Ok(None); + }; + + Ok(Some(get_schema_internal(namespace, repos).await?)) +} + +/// Gets the namespace schema including all tables and columns. +pub async fn get_schema_by_name( + name: &str, + repos: &mut R, + deleted: SoftDeletedRows, +) -> Result, crate::interface::Error> +where + R: RepoCollection + ?Sized, +{ + let Some(namespace) = repos.namespaces().get_by_name(name, deleted).await? else { + return Ok(None); + }; + + Ok(Some(get_schema_internal(namespace, repos).await?)) +} + +async fn get_schema_internal( + namespace: Namespace, + repos: &mut R, +) -> Result +where + R: RepoCollection + ?Sized, +{ + // get the columns first just in case someone else is creating schema while we're doing this. 
+ let columns = repos.columns().list_by_namespace_id(namespace.id).await?; + let tables = repos.tables().list_by_namespace_id(namespace.id).await?; + + let mut namespace = NamespaceSchema::new_empty_from(&namespace); + + let mut table_id_to_schema = BTreeMap::new(); + for t in tables { + let table_schema = TableSchema::new_empty_from(&t); + table_id_to_schema.insert(t.id, (t.name, table_schema)); + } + + for c in columns { + let (_, t) = table_id_to_schema.get_mut(&c.table_id).unwrap(); + t.add_column(c); + } + + for (_, (table_name, schema)) in table_id_to_schema { + namespace.tables.insert(table_name, schema); + } + + Ok(namespace) +} + +/// Gets the schema for one particular table in a namespace. +pub async fn get_schema_by_namespace_and_table( + name: &str, + table_name: &str, + repos: &mut R, + deleted: SoftDeletedRows, +) -> Result, crate::interface::Error> +where + R: RepoCollection + ?Sized, +{ + let Some(namespace) = repos.namespaces().get_by_name(name, deleted).await? else { + return Ok(None); + }; + + let Some(table) = repos + .tables() + .get_by_namespace_and_name(namespace.id, table_name) + .await? + else { + return Ok(None); + }; + + let mut table_schema = TableSchema::new_empty_from(&table); + + let columns = repos.columns().list_by_table_id(table.id).await?; + for c in columns { + table_schema.add_column(c); + } + + let mut namespace = NamespaceSchema::new_empty_from(&namespace); + namespace + .tables + .insert(table_name.to_string(), table_schema); + + Ok(Some(namespace)) +} + +/// Gets all the table's columns. +pub async fn get_table_columns_by_id( + id: TableId, + repos: &mut R, +) -> Result +where + R: RepoCollection + ?Sized, +{ + let columns = repos.columns().list_by_table_id(id).await?; + + Ok(ColumnsByName::new(columns)) +} + +/// Fetch all [`NamespaceSchema`] in the catalog. +/// +/// This method performs the minimal number of queries needed to build the +/// result set. 
No table lock is obtained, nor are queries executed within a +/// transaction, but this method does return a point-in-time snapshot of the +/// catalog state. +/// +/// # Soft Deletion +/// +/// No schemas for soft-deleted namespaces are returned. +pub async fn list_schemas( + catalog: &dyn Catalog, +) -> Result, crate::interface::Error> { + let mut repos = catalog.repositories(); + + // In order to obtain a point-in-time snapshot, first fetch the columns, + // then the tables, and then resolve the namespace IDs to Namespace in order + // to construct the schemas. + // + // The set of columns returned forms the state snapshot, with the subsequent + // queries resolving only what is needed to construct schemas for the + // retrieved columns (ignoring any newly added tables/namespaces since the + // column snapshot was taken). + // + // This approach also tolerates concurrently deleted namespaces, which are + // simply ignored at the end when joining to the namespace query result. + + // First fetch all the columns - this is the state snapshot of the catalog + // schemas. + let columns = repos.columns().list().await?; + + // Construct the set of table IDs these columns belong to. + let retain_table_ids = columns.iter().map(|c| c.table_id).collect::>(); + + // Fetch all tables, and filter for those that are needed to construct + // schemas for "columns" only. + // + // Discard any tables that have no columns or have been created since + // the "columns" snapshot was retrieved, and construct a map of ID->Table. + let tables = repos + .tables() + .list() + .await? + .into_iter() + .filter_map(|t| { + if !retain_table_ids.contains(&t.id) { + return None; + } + + Some((t.id, t)) + }) + .collect::>(); + + // Drop the table ID set as it will not be referenced again. + drop(retain_table_ids); + + // Do all the I/O to fetch the namespaces in the background, while this + // thread constructs the NamespaceId->TableSchema map below. 
+ let namespaces = tokio::spawn(async move { + repos + .namespaces() + .list(SoftDeletedRows::ExcludeDeleted) + .await + }); + + // A set of tables within a single namespace. + type NamespaceTables = BTreeMap; + + let mut joined = HashMap::::default(); + for column in columns { + // Resolve the table this column references + let table = tables.get(&column.table_id).expect("no table for column"); + + let table_schema = joined + // Find or create a record in the joined map + // for this namespace ID. + .entry(table.namespace_id) + .or_default() + // Fetch the schema record for this table, or create an empty one. + .entry(table.name.clone()) + .or_insert_with(|| TableSchema::new_empty_from(table)); + + table_schema.add_column(column); + } + + // The table map is no longer needed - immediately reclaim the memory. + drop(tables); + + // Convert the Namespace instances into NamespaceSchema instances. + let iter = namespaces + .await + .expect("namespace list task panicked")? + .into_iter() + // Ignore any namespaces that did not exist when the "columns" snapshot + // was created, or have no tables/columns (and therefore have no entry + // in "joined"). + .filter_map(move |v| { + // The catalog call explicitly asked for no soft deleted records. + assert!(v.deleted_at.is_none()); + + let mut ns = NamespaceSchema::new_empty_from(&v); + + ns.tables = joined.remove(&v.id)?; + Some((v, ns)) + }); + + Ok(iter) +} + +/// In a backoff loop, retry calling the compare-and-swap sort key catalog function if the catalog +/// returns a query error unrelated to the CAS operation. +/// +/// Returns with a value of `Ok` containing the new sort key if: +/// +/// - No concurrent updates were detected +/// - A concurrent update was detected, but the other update resulted in the same value this update +/// was attempting to set +/// +/// Returns with a value of `Err(newly_observed_value)` if a concurrent, conflicting update was +/// detected. 
It is expected that callers of this function will take the returned value into +/// account (in whatever manner is appropriate) before calling this function again. +/// +/// NOTE: it is expected that ONLY processes that ingest data (currently only the ingesters or the +/// bulk ingest API) update sort keys for existing partitions. Consider how calling this function +/// from new processes will interact with the existing calls. +pub async fn retry_cas_sort_key( + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + partition_id: PartitionId, + catalog: Arc, +) -> Result { + use backoff::Backoff; + use observability_deps::tracing::{info, warn}; + use std::ops::ControlFlow; + + Backoff::new(&Default::default()) + .retry_with_backoff("cas_sort_key", || { + let new_sort_key_ids = new_sort_key_ids.clone(); + let catalog = Arc::clone(&catalog); + async move { + let mut repos = catalog.repositories(); + match repos + .partitions() + .cas_sort_key(partition_id, old_sort_key_ids, &new_sort_key_ids) + .await + { + Ok(_) => ControlFlow::Break(Ok(new_sort_key_ids)), + Err(CasFailure::QueryError(e)) => ControlFlow::Continue(e), + Err(CasFailure::ValueMismatch(observed_sort_key_ids)) + if observed_sort_key_ids == new_sort_key_ids => + { + // A CAS failure occurred because of a concurrent + // sort key update, however the new catalog sort key + // exactly matches the sort key this node wants to + // commit. + // + // This is the sad-happy path, and this task can + // continue. + info!( + %partition_id, + ?old_sort_key_ids, + ?observed_sort_key_ids, + update_sort_key_ids=?new_sort_key_ids, + "detected matching concurrent sort key update" + ); + ControlFlow::Break(Ok(new_sort_key_ids)) + } + Err(CasFailure::ValueMismatch(observed_sort_key_ids)) => { + // Another ingester concurrently updated the sort + // key. + // + // This breaks a sort-key update invariant - sort + // key updates MUST be serialised. This operation must + // be retried. 
+ // + // See: + // https://github.com/influxdata/influxdb_iox/issues/6439 + // + warn!( + %partition_id, + ?old_sort_key_ids, + ?observed_sort_key_ids, + update_sort_key_ids=?new_sort_key_ids, + "detected concurrent sort key update" + ); + // Stop the retry loop with an error containing the + // newly observed sort key. + ControlFlow::Break(Err(observed_sort_key_ids)) + } + } + } + }) + .await + .expect("retry forever") +} + +/// An [`crate::interface::Error`] scoped to a single table for schema validation errors. +#[derive(Debug, Error)] +#[error("table {}, {}", .0, .1)] +pub struct TableScopedError(String, Error); + +impl TableScopedError { + /// Return the table name for this error. + pub fn table(&self) -> &str { + &self.0 + } + + /// Return a reference to the error. + pub fn err(&self) -> &Error { + &self.1 + } + + /// Return ownership of the error, discarding the table name. + pub fn into_err(self) -> Error { + self.1 + } +} + +/// Given an iterator of `(table_name, batch)` to validate, this function +/// ensures all the columns within `batch` match the existing schema for +/// `table_name` in `schema`. If the column does not already exist in `schema`, +/// it is created and an updated [`NamespaceSchema`] is returned. +/// +/// This function pushes schema additions through to the backend catalog, and +/// relies on the catalog to serialize concurrent additions of a given column, +/// ensuring only one type is ever accepted per column. +pub async fn validate_or_insert_schema<'a, T, U, R>( + tables: T, + schema: &NamespaceSchema, + repos: &mut R, +) -> Result, TableScopedError> +where + T: IntoIterator + Send + Sync, + U: Iterator + Send, + R: RepoCollection + ?Sized, +{ + let tables = tables.into_iter(); + + // The (potentially updated) NamespaceSchema to return to the caller. 
+ let mut schema = Cow::Borrowed(schema); + + for (table_name, batch) in tables { + validate_mutable_batch(batch, table_name, &mut schema, repos).await?; + } + + match schema { + Cow::Owned(v) => Ok(Some(v)), + Cow::Borrowed(_) => Ok(None), + } +} + +// &mut Cow is used to avoid a copy, so allow it +#[allow(clippy::ptr_arg)] +async fn validate_mutable_batch( + mb: &MutableBatch, + table_name: &str, + schema: &mut Cow<'_, NamespaceSchema>, + repos: &mut R, +) -> Result<(), TableScopedError> +where + R: RepoCollection + ?Sized, +{ + // Check if the table exists in the schema. + // + // Because the entry API requires &mut it is not used to avoid a premature + // clone of the Cow. + let mut table = match schema.tables.get(table_name) { + Some(t) => Cow::Borrowed(t), + None => { + // The table does not exist in the cached schema. + // + // Attempt to load an existing table from the catalog or create a new table in the + // catalog to populate the cache. + let table = + table_load_or_create(repos, schema.id, &schema.partition_template, table_name) + .await + .map_err(|e| TableScopedError(table_name.to_string(), e))?; + + assert!(schema + .to_mut() + .tables + .insert(table_name.to_string(), table) + .is_none()); + + Cow::Borrowed(schema.tables.get(table_name).unwrap()) + } + }; + + // The table is now in the schema (either by virtue of it already existing, + // or through adding it above). + // + // If the table itself needs to be updated during column validation it + // becomes a Cow::owned() copy and the modified copy should be inserted into + // the schema before returning. + validate_and_insert_columns( + mb.columns() + .map(|(name, col)| (name, col.influx_type().into())), + table_name, + &mut table, + repos, + ) + .await?; + + if let Cow::Owned(table) = table { + // The table schema was mutated and needs inserting into the namespace + // schema to make the changes visible to the caller. 
+ assert!(schema + .to_mut() + .tables + .insert(table_name.to_string(), table) + .is_some()); + } + + Ok(()) +} + +/// Given an iterator of `(column_name, column_type)` to validate, this function ensures all the +/// columns match the existing `TableSchema` in `table`. If the column does not already exist in +/// `table`, it is created and the `table` is changed to the `Cow::Owned` variant. +/// +/// This function pushes schema additions through to the backend catalog, and relies on the catalog +/// to serialize concurrent additions of a given column, ensuring only one type is ever accepted +/// per column. +// &mut Cow is used to avoid a copy, so allow it +#[allow(clippy::ptr_arg)] +pub async fn validate_and_insert_columns( + columns: impl Iterator + Send, + table_name: &str, + table: &mut Cow<'_, TableSchema>, + repos: &mut R, +) -> Result<(), TableScopedError> +where + R: RepoCollection + ?Sized, +{ + let mut column_batch: HashMap<&str, ColumnType> = HashMap::new(); + + for (name, column_type) in columns { + // Check if the column exists in the cached schema. + // + // If it does, validate it. If it does not exist, create it and insert + // it into the cached schema. + + match table.columns.get(name.as_str()) { + Some(existing) if existing.column_type == column_type => { + // No action is needed as the column matches the existing column + // schema. + } + Some(existing) => { + // The column schema and the column in the schema change are of + // different types. + return Err(TableScopedError( + table_name.to_string(), + Error::AlreadyExists { + descr: format!( + "column {} is type {} but schema update has type {}", + name, existing.column_type, column_type + ), + }, + )); + } + None => { + // The column does not exist in the cache, add it to the column + // batch to be bulk inserted later. 
+ let old = column_batch.insert(name.as_str(), column_type); + assert!( + old.is_none(), + "duplicate column name `{name}` in new column schema shouldn't be possible" + ); + } + } + } + + if !column_batch.is_empty() { + repos + .columns() + .create_or_get_many_unchecked(table.id, column_batch) + .await + .map_err(|e| TableScopedError(table_name.to_string(), e))? + .into_iter() + .for_each(|c| table.to_mut().add_column(c)); + } + + Ok(()) +} + +/// Load or create table. +pub async fn table_load_or_create( + repos: &mut R, + namespace_id: NamespaceId, + namespace_partition_template: &NamespacePartitionTemplateOverride, + table_name: &str, +) -> Result +where + R: RepoCollection + ?Sized, +{ + let table = match repos + .tables() + .get_by_namespace_and_name(namespace_id, table_name) + .await? + { + Some(table) => table, + None => { + // There is a possibility of a race condition here, if another request has also + // created this table after the `get_by_namespace_and_name` call but before + // this `create` call. In that (hopefully) rare case, do an additional fetch + // from the catalog for the record that should now exist. + let create_result = repos + .tables() + .create( + table_name, + // This table is being created implicitly by this write, so there's no + // possibility of a user-supplied partition template here, which is why there's + // a hardcoded `None`. If there is a namespace template, it must be valid because + // validity was checked during its creation, so that's why there's an `expect`. + TablePartitionTemplateOverride::try_new(None, namespace_partition_template) + .expect("no table partition template; namespace partition template has been validated"), + namespace_id, + ) + .await; + if let Err(Error::AlreadyExists { .. }) = create_result { + repos + .tables() + .get_by_namespace_and_name(namespace_id, table_name) + // Propagate any `Err` returned by the catalog + .await? 
+ // Getting `Ok(None)` should be impossible if we're in this code path because + // the `create` request just said the table exists + .expect( + "Table creation failed because the table exists, so looking up the table \ + should return `Some(table)`, but it returned `None`", + ) + } else { + create_result? + } + } + }; + + let mut table = TableSchema::new_empty_from(&table); + + // Always add a time column to all new tables. + let time_col = repos + .columns() + .create_or_get(TIME_COLUMN, table.id, ColumnType::Time) + .await?; + + table.add_column(time_col); + + Ok(table) +} + +#[cfg(test)] +mod tests { + use std::{collections::BTreeMap, sync::Arc}; + + use super::*; + use crate::{interface::SoftDeletedRows, mem::MemCatalog, util::get_schema_by_name}; + + // Generate a test that simulates multiple, sequential writes in `lp` and + // asserts the resulting schema. + // + // This test asserts the cached schema and the database entry are always in + // sync. + macro_rules! test_validate_schema { + ( + $name:ident, + lp = [$($lp:literal,)+], // An array of multi-line LP writes + want_observe_conflict = $want_observe_conflict:literal, // true if a schema validation error should be observed at some point + want_schema = {$($want_schema:tt) +} // The expected resulting schema after all writes complete. + ) => { + paste::paste! 
{ + #[allow(clippy::bool_assert_comparison)] + #[tokio::test] + async fn []() { + use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; + use std::ops::DerefMut; + use pretty_assertions::assert_eq; + const NAMESPACE_NAME: &str = "bananas"; + + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(iox_time::SystemProvider::new()); + let repo = MemCatalog::new(metrics, time_provider); + let mut txn = repo.repositories(); + + let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME) + .await; + let schema = NamespaceSchema::new_empty_from(&namespace); + + // Apply all the lp literals as individual writes, feeding + // the result of one validation into the next to drive + // incremental construction of the schemas. + let mut observed_conflict = false; + $( + let schema = { + let lp: String = $lp.to_string(); + + let writes = mutable_batch_lp::lines_to_batches(lp.as_str(), 42) + .expect("failed to build test writes from LP"); + + let got = validate_or_insert_schema(writes.iter().map(|(k, v)| (k.as_str(), v)), &schema, txn.deref_mut()) + .await; + + match got { + Err(TableScopedError(_, Error::AlreadyExists{ .. })) => { + observed_conflict = true; + schema + }, + Err(e) => panic!("unexpected error: {}", e), + Ok(Some(new_schema)) => new_schema, + Ok(None) => schema, + } + }; + )+ + + assert_eq!($want_observe_conflict, observed_conflict, "should error mismatch"); + + // Invariant: in absence of concurrency, the schema within + // the database must always match the incrementally built + // cached schema. 
+ let db_schema = get_schema_by_name(NAMESPACE_NAME, txn.deref_mut(), SoftDeletedRows::ExcludeDeleted) + .await + .expect("database failed to query for namespace schema") + .expect("namespace exists"); + assert_eq!(schema, db_schema, "schema in DB and cached schema differ"); + + // Generate the map of tables => desired column types + let want_tables: BTreeMap, ColumnType>> = test_validate_schema!(@table, $($want_schema)+); + + // Generate a similarly structured map from the actual + // schema + let actual_tables: BTreeMap, ColumnType>> = schema + .tables + .iter() + .map(|(table, table_schema)| { + let desired_cols = table_schema + .columns + .iter() + .map(|(column, column_schema)| (Arc::clone(&column), column_schema.column_type)) + .collect::>(); + + (table.clone(), desired_cols) + }) + .collect(); + + // Assert the actual namespace contents matches the desired + // table schemas in the test args. + assert_eq!(want_tables, actual_tables, "cached schema and desired schema differ"); + } + } + }; + // Generate a map of table names => column map (below) + // + // out: BTreeMap> + (@table, $($table_name:literal: [$($columns:tt) +],)*) => {{ + let mut tables = BTreeMap::new(); + $( + let want_cols = test_validate_schema!(@column, $($columns)+); + assert!(tables.insert($table_name.to_string(), want_cols).is_none()); + )* + tables + }}; + // Generate a map of column names => ColumnType + // + // out: BTreeMap + (@column, $($col_name:literal => $col_type:expr,)+) => {{ + let mut cols = BTreeMap::new(); + $( + assert!(cols.insert(Arc::from($col_name), $col_type).is_none()); + )* + cols + }}; + } + + test_validate_schema!( + one_write_multiple_tables, + lp = [ + " + m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ + m1,t1=a f1=3i 2\n\ + m2,t3=b f1=true 1\n\ + ", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "t2" => ColumnType::Tag, + "f1" => ColumnType::I64, + "f2" => ColumnType::F64, + "time" => ColumnType::Time, + ], + "m2": [ + "f1" => 
ColumnType::Bool, + "t3" => ColumnType::Tag, + "time" => ColumnType::Time, + ], + } + ); + + // test that a new table will be created + test_validate_schema!( + two_writes_incremental_new_table, + lp = [ + " + m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ + m1,t1=a f1=3i 2\n\ + m2,t3=b f1=true 1\n\ + ", + " + m1,t1=c f1=1i 2\n\ + new_measurement,t9=a f10=true 1\n\ + ", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "t2" => ColumnType::Tag, + "f1" => ColumnType::I64, + "f2" => ColumnType::F64, + "time" => ColumnType::Time, + ], + "m2": [ + "f1" => ColumnType::Bool, + "t3" => ColumnType::Tag, + "time" => ColumnType::Time, + ], + "new_measurement": [ + "t9" => ColumnType::Tag, + "f10" => ColumnType::Bool, + "time" => ColumnType::Time, + ], + } + ); + + // test that a new column for an existing table will be created + test_validate_schema!( + two_writes_incremental_new_column, + lp = [ + " + m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ + m1,t1=a f1=3i 2\n\ + m2,t3=b f1=true 1\n\ + ", + "m1,new_tag=c new_field=1i 2", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "t2" => ColumnType::Tag, + "f1" => ColumnType::I64, + "f2" => ColumnType::F64, + "time" => ColumnType::Time, + // These are the incremental additions: + "new_tag" => ColumnType::Tag, + "new_field" => ColumnType::I64, + ], + "m2": [ + "f1" => ColumnType::Bool, + "t3" => ColumnType::Tag, + "time" => ColumnType::Time, + ], + } + ); + + test_validate_schema!( + table_always_has_time_column, + lp = [ + "m1,t1=a f1=2i", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "f1" => ColumnType::I64, + "time" => ColumnType::Time, + ], + } + ); + + test_validate_schema!( + two_writes_conflicting_column_types, + lp = [ + "m1,t1=a f1=2i", + // Second write has conflicting type for f1. 
+ "m1,t1=a f1=2.0", + ], + want_observe_conflict = true, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "f1" => ColumnType::I64, + "time" => ColumnType::Time, + ], + } + ); + + test_validate_schema!( + two_writes_tag_field_transposition, + lp = [ + // x is a tag + "m1,t1=a,x=t f1=2i", + // x is a field + "m1,t1=a x=t,f1=2i", + ], + want_observe_conflict = true, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "x" => ColumnType::Tag, + "f1" => ColumnType::I64, + "time" => ColumnType::Time, + ], + } + ); + + #[tokio::test] + async fn validate_table_create_race_doesnt_get_all_columns() { + use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; + use std::{collections::BTreeSet, ops::DerefMut}; + const NAMESPACE_NAME: &str = "bananas"; + + let repo = MemCatalog::new( + Default::default(), + Arc::new(iox_time::SystemProvider::new()), + ); + let mut txn = repo.repositories(); + let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME).await; + + // One cached schema has no tables. + let empty_schema = NamespaceSchema::new_empty_from(&namespace); + + // Another cached schema gets a write that creates a table with some columns. 
+ let schema_with_table = empty_schema.clone(); + let writes = mutable_batch_lp::lines_to_batches("m1,t1=a f1=2i", 42).unwrap(); + validate_or_insert_schema( + writes.iter().map(|(k, v)| (k.as_str(), v)), + &schema_with_table, + txn.deref_mut(), + ) + .await + .unwrap(); + + // then the empty schema adds the same table with some different columns + let other_writes = mutable_batch_lp::lines_to_batches("m1,t2=a f2=2i", 43).unwrap(); + let formerly_empty_schema = validate_or_insert_schema( + other_writes.iter().map(|(k, v)| (k.as_str(), v)), + &empty_schema, + txn.deref_mut(), + ) + .await + .unwrap() + .unwrap(); + + // the formerly-empty schema should NOT have all the columns; schema convergence is handled + // at a higher level by the namespace cache/gossip system + let table = formerly_empty_schema.tables.get("m1").unwrap(); + assert_eq!(table.columns.names(), BTreeSet::from(["t2", "f2", "time"])); + } +} diff --git a/iox_data_generator/Cargo.toml b/iox_data_generator/Cargo.toml new file mode 100644 index 0000000..7289896 --- /dev/null +++ b/iox_data_generator/Cargo.toml @@ -0,0 +1,48 @@ +[package] +name = "iox_data_generator" +default-run = "iox_data_generator" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +bytes = "1.5" +chrono = { version = "0.4", default-features = false } +clap = { version = "4", features = ["derive", "env", "cargo"] } +datafusion_util = { path = "../datafusion_util" } +futures = "0.3" +handlebars = "5.1.0" +humantime = "2.1.0" +influxdb2_client = { path = "../influxdb2_client" } +itertools = "0.12.0" +mutable_batch_lp = { path = "../mutable_batch_lp" } +mutable_batch = { path = "../mutable_batch" } +parquet_file = { path = "../parquet_file" } +rand = { version = "0.8.3", features = ["small_rng"] } +regex = "1.10" +schema = { path = "../schema" } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0.111" +snafu = "0.8" 
+tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +toml = "0.8.8" +tracing = "0.1" +tracing-subscriber = "0.3" +uuid = { version = "1", default_features = false } + +[dev-dependencies] +criterion = { version = "0.5", default-features = false, features = ["rayon"]} +test_helpers = { path = "../test_helpers" } + +[[bench]] +name = "point_generation" +harness = false + +[lib] +# Allow --save-baseline to work +# https://github.com/bheisler/criterion.rs/issues/275 +bench = false diff --git a/iox_data_generator/README.md b/iox_data_generator/README.md new file mode 100644 index 0000000..3bf275f --- /dev/null +++ b/iox_data_generator/README.md @@ -0,0 +1,19 @@ +# `iox_data_generator` + +The `iox_data_generator` tool creates random data points according to a specification and loads them +into an `iox` instance to simulate real data. + +To build and run, [first install Rust](https://www.rust-lang.org/tools/install). Then from root of the `influxdb_iox` repo run: + +``` +cargo build --release +``` + +And the built binary has command line help: + +``` +./target/release/iox_data_generator --help +``` + +For examples of specifications see the [schemas folder](schemas). The [full_example](schemas/full_example.toml) is the +most comprehensive with comments and example output. 
diff --git a/iox_data_generator/benches/point_generation.rs b/iox_data_generator/benches/point_generation.rs new file mode 100644 index 0000000..e29af90 --- /dev/null +++ b/iox_data_generator/benches/point_generation.rs @@ -0,0 +1,223 @@ +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; +use iox_data_generator::{ + agent::Agent, + specification::{ + AgentAssignmentSpec, AgentSpec, DataSpec, DatabaseWriterSpec, FieldSpec, FieldValueSpec, + MeasurementSpec, + }, + tag_set::GeneratedTagSets, + write::PointsWriterBuilder, +}; +use std::{ + sync::{atomic::AtomicU64, Arc}, + time::Duration, +}; + +pub fn single_agent(c: &mut Criterion) { + let spec = DataSpec { + name: "benchmark".into(), + values: vec![], + tag_sets: vec![], + agents: vec![AgentSpec { + name: "foo".to_string(), + measurements: vec![MeasurementSpec { + name: "measurement-1".into(), + count: None, + fields: vec![FieldSpec { + name: "field-1".into(), + field_value_spec: FieldValueSpec::Bool(true), + count: None, + }], + tag_set: None, + tag_pairs: vec![], + }], + has_one: vec![], + tag_pairs: vec![], + }], + database_writers: vec![DatabaseWriterSpec { + database_ratio: Some(1.0), + database_regex: None, + agents: vec![AgentAssignmentSpec { + name: "foo".to_string(), + count: None, + sampling_interval: "1s".to_string(), + }], + }], + }; + + let mut points_writer = PointsWriterBuilder::new_no_op(true); + + let start_datetime = Some(0); + let one_hour_s = 60 * 60; + let ns_per_second = 1_000_000_000; + let end_datetime = Some(one_hour_s * ns_per_second); + + let expected_points = 3601; + + let mut group = c.benchmark_group("single_agent"); + group.throughput(Throughput::Elements(expected_points)); + + group.bench_function("single agent with basic configuration", |b| { + b.iter(|| { + let r = block_on(iox_data_generator::generate( + &spec, + vec!["foo_bar".to_string()], + &mut points_writer, + start_datetime, + end_datetime, + 0, + false, + 1, + false, + )); + let n_points = 
r.expect("Could not generate data"); + assert_eq!(n_points, expected_points as usize); + }) + }); +} + +pub fn agent_pre_generated(c: &mut Criterion) { + let spec: DataSpec = toml::from_str( + r#" +name = "storage_cardinality_example" + +# Values are automatically generated before the agents are initialized. They generate tag key/value +# pairs with the name of the value as the tag key and the evaluated template as the value. These +# pairs are Arc wrapped so they can be shared across tagsets and used in the agents as +# pre-generated data. +[[values]] +# the name must not have a . in it, which is used to access children later. Otherwise it's open. +name = "role" +# the template can use a number of helpers to get an id, a random string and the name, see below +# for examples +template = "storage" +# this number of tag pairs will be generated. If this is > 1, the id or a random character string +# should be used in the template to ensure that the tag key/value pairs are unique. +cardinality = 1 + +[[values]] +name = "url" +template = "http://127.0.0.1:6060/metrics/usage" +cardinality = 1 + +[[values]] +name = "org_id" +# Fill in the value with the cardinality counter and 15 random alphanumeric characters +template = "{{id}}_{{random 15}}" +cardinality = 1000 +has_one = ["env"] + +[[values]] +name = "env" +template = "whatever-environment-{{id}}" +cardinality = 10 + +[[values]] +name = "bucket_id" +# a bucket belongs to an org. With this, you would be able to access the org.id or org.value in the +# template +belongs_to = "org_id" +# each bucket will have a unique id, which is used here to guarantee uniqueness even across orgs. +# We also have a random 15 character alphanumeric sequence to pad out the value length. +template = "{{id}}_{{random 15}}" +# For each org, 3 buckets will be generated +cardinality = 3 + +[[values]] +name = "partition_id" +template = "{{id}}" +cardinality = 10 + +# makes a tagset so every bucket appears in every partition. 
The other tags are descriptive and +# don't increase the cardinality beyond count(bucket) * count(partition). Later this example will +# use the agent and measurement generation to take this base tagset and increase cardinality on a +# per-agent basis. +[[tag_sets]] +name = "bucket_set" +for_each = [ + "role", + "url", + "org_id", + "org_id.env", + "org_id.bucket_id", + "partition_id", +] + +[[agents]] +name = "foo" + +[[agents.measurements]] +name = "storage_usage_bucket_cardinality" +# each sampling will have all the tag sets from this collection in addition to the tags and +# tag_pairs specified +tag_set = "bucket_set" +# for each agent, this specific measurement will be decorated with these additional tags. +tag_pairs = [ + {key = "node_id", template = "{{agent.id}}"}, + {key = "hostname", template = "{{agent.id}}"}, + {key = "host", template = "storage-{{agent.id}}"}, +] + +[[agents.measurements.fields]] +name = "gauge" +i64_range = [1, 8147240] + +[[database_writers]] +agents = [{name = "foo", sampling_interval = "1s", count = 3}] +"#, + ) + .unwrap(); + + let generated_tag_sets = GeneratedTagSets::from_spec(&spec).unwrap(); + + let mut points_writer = PointsWriterBuilder::new_no_op(true); + + let start_datetime = Some(0); + let one_hour_s = 60 * 60; + let ns_per_second = 1_000_000_000; + let end_datetime = Some(one_hour_s * ns_per_second); + + let mut agents = Agent::from_spec( + &spec.agents[0], + 3, + Duration::from_millis(10), + start_datetime, + end_datetime, + 0, + false, + &generated_tag_sets, + ) + .unwrap(); + let agent = agents.first_mut().unwrap(); + let expected_points = 30000; + + let counter = Arc::new(AtomicU64::new(0)); + let request_counter = Arc::new(AtomicU64::new(0)); + let mut group = c.benchmark_group("agent_pre_generated"); + group.measurement_time(std::time::Duration::from_secs(50)); + group.throughput(Throughput::Elements(expected_points)); + + group.bench_function("single agent with basic configuration", |b| { + b.iter(|| { + 
agent.reset_current_date_time(0); + let points_writer = + Arc::new(points_writer.build_for_agent("foo", "foo", "foo").unwrap()); + let r = block_on(agent.generate_all( + points_writer, + 1, + Arc::clone(&counter), + Arc::clone(&request_counter), + )); + let n_points = r.expect("Could not generate data"); + assert_eq!(n_points.row_count, expected_points as usize); + }) + }); +} + +#[tokio::main] +async fn block_on(f: F) -> F::Output { + f.await +} + +criterion_group!(benches, single_agent, agent_pre_generated); +criterion_main!(benches); diff --git a/iox_data_generator/schemas/big_db.toml b/iox_data_generator/schemas/big_db.toml new file mode 100644 index 0000000..73ca71d --- /dev/null +++ b/iox_data_generator/schemas/big_db.toml @@ -0,0 +1,143 @@ +# this schema is for testing what it looks like with a database that has +# hundreds of thousands of measurements with different levels of throughput. +# +# The high agent sends 10k lines with 500 measurements totaling 2.48 MB per sampling +# The medium agent sends 10k lines with 1k measurements totaling 2.14 MB per sampling +# The low agent sends 10k lines with 10k measurements and 1.45 MB per sampling +# +# Based on the database_writers at the bottom, this will write 225k total measurements +# across 50 separate agents writing once every 10s. 
Aggregate throughput is about +# 35.76 MB/sec of raw line protocol +name = "big_db" + +[[values]] +name = "some_tag_here" +cardinality = 10 +template = "value-{{id}}-{{random 5}}" + +[[values]] +name = "some_other_tag" +cardinality = 2 +template = "value-{{id}}-{{random 10}}" +belongs_to = "some_tag_here" + +[[values]] +name = "some_static_tag" +cardinality = 1 +template = "whatevs-is-something-we-have" + +[[tag_sets]] +name = "20card" +for_each = [ + "some_tag_here", + "some_tag_here.some_other_tag", + "some_static_tag", +] + +[[tag_sets]] +name = "10card" +for_each = [ + "some_tag_here", + "some_static_tag", +] + +[[tag_sets]] +name = "2card" +for_each = [ + "some_other_tag", + "some_static_tag", +] + +# generates data that looks like: +# +# high_measurement_10_card_500_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=500,some_other_tag=value-17-0wyJ8VuUO7,some_static_tag=whatevs-is-something-we-have,some_tag_here=value-9-fuFo3 intfield=63976i,floatfield=0.6004810270043124 1639597814875290000 +# high_measurement_10_card_500_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=500,some_other_tag=value-18-I9P4V97Kfm,some_static_tag=whatevs-is-something-we-have,some_tag_here=value-9-fuFo3 intfield=24564i,floatfield=0.11957361442062764 1639597814875290000 +# high_measurement_10_card_500_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=500,some_other_tag=value-19-HaW3lHJ2le,some_static_tag=whatevs-is-something-we-have,some_tag_here=value-10-yH0Bj intfield=18157i,floatfield=0.10429525001385809 1639597814875290000 +# high_measurement_10_card_500_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=500,some_other_tag=value-20-XOgmzSFzm7,some_static_tag=whatevs-is-something-we-have,some_tag_here=value-10-yH0Bj intfield=51041i,floatfield=0.802468465951919 1639597814875290000 +[[agents]] +name = "high" +tag_pairs = [ + {key = "agent_id", template = "{{agent.id}}"}, + {key = "foo_bar", template = "stuff-is-here-now"} +] + +[[agents.measurements]] +name = 
"high_measurement_10_card_{{measurement.id}}_{{agent.id}}" +count = 500 +tag_set = "20card" +tag_pairs = [ + {key = "measurement_id", template = "{{measurement.id}}"} +] + +[[agents.measurements.fields]] +name = "intfield" +i64_range = [1, 100000] + +[[agents.measurements.fields]] +name = "floatfield" +f64_range = [0.0, 1.0] + +# generates data that looks like: +# +# med_measurement_10_card_1000_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=1000,some_static_tag=whatevs-is-something-we-have,some_tag_here=value-7-UhxFA intfield=24707i,floatfield=0.762661180672112 1639597855224165000 +# med_measurement_10_card_1000_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=1000,some_static_tag=whatevs-is-something-we-have,some_tag_here=value-8-YzAUN intfield=94490i,floatfield=0.4309492192063673 1639597855224165000 +# med_measurement_10_card_1000_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=1000,some_static_tag=whatevs-is-something-we-have,some_tag_here=value-9-vUmMN intfield=68817i,floatfield=0.9156455784544137 1639597855224165000 +# med_measurement_10_card_1000_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=1000,some_static_tag=whatevs-is-something-we-have,some_tag_here=value-10-gxcic intfield=84220i,floatfield=0.9267974321691199 1639597855224165000 +[[agents]] +name = "medium" +tag_pairs = [ + {key = "agent_id", template = "{{agent.id}}"}, + {key = "foo_bar", template = "stuff-is-here-now"} +] + +[[agents.measurements]] +name = "med_measurement_10_card_{{measurement.id}}_{{agent.id}}" +count = 1000 +tag_set = "10card" +tag_pairs = [ + {key = "measurement_id", template = "{{measurement.id}}"} +] + +[[agents.measurements.fields]] +name = "intfield" +i64_range = [1, 100000] + +[[agents.measurements.fields]] +name = "floatfield" +f64_range = [0.0, 1.0] + +# generates data that looks like: +# +# low_measurement_2_card_4986_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=4986 intfield=17484i,floatfield=0.5834872217437403 1639597582877742000 +# 
low_measurement_2_card_4987_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=4987 intfield=83563i,floatfield=0.7354522843365716 1639597582877742000 +# low_measurement_2_card_4988_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=4988 intfield=74676i,floatfield=0.7443686050113958 1639597582877742000 +# low_measurement_2_card_4989_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=4989 intfield=69285i,floatfield=0.05047660569705048 1639597582877742000 +# low_measurement_2_card_4990_1,agent_id=1,foo_bar=stuff-is-here-now,measurement_id=4990 intfield=36686i,floatfield=0.7546950434825994 1639597582877742000 +[[agents]] +name = "low" +tag_pairs = [ + {key = "agent_id", template = "{{agent.id}}"}, + {key = "foo_bar", template = "stuff-is-here-now"} +] + +[[agents.measurements]] +name = "low_measurement_2_card_{{measurement.id}}_{{agent.id}}" +count = 10000 +tag_pairs = [ + {key = "measurement_id", template = "{{measurement.id}}"} +] + +[[agents.measurements.fields]] +name = "intfield" +i64_range = [1, 100000] + +[[agents.measurements.fields]] +name = "floatfield" +f64_range = [0.0, 1.0] + +[[database_writers]] +agents = [ + {name = "high", sampling_interval = "10s", count = 10}, # 5,000 measurements + {name = "medium", sampling_interval = "10s", count = 20}, # 20,000 measurements + {name = "low", sampling_interval = "10s", count = 20} # 200,000 measurements +] diff --git a/iox_data_generator/schemas/cap-write.toml b/iox_data_generator/schemas/cap-write.toml new file mode 100644 index 0000000..5b77a85 --- /dev/null +++ b/iox_data_generator/schemas/cap-write.toml @@ -0,0 +1,405 @@ +# This config file aims to replicate the data produced by the capwrite tool: +# https://github.com/influxdata/idpe/tree/e493a8e9b6b773e9374a8542ddcab7d8174d320d/performance/capacity/write +name = "cap_write" + +[[database_writers]] +database_ratio = 1.0 +agents = [{name = "telegraf", count = 3, sampling_interval = "10s"}] + +[[agents]] +name = "telegraf" +tag_pairs = [ + {key = 
"host", template = "host-{{agent.id}}"} +] + +[[agents.measurements]] +name = "system" + + [[agents.measurements.fields]] + name = "n_cpus" + i64_range = [8, 8] + + [[agents.measurements.fields]] + name = "n_users" + i64_range = [2, 11] + + [[agents.measurements.fields]] + name = "uptime" + uptime = "i64" + + [[agents.measurements.fields]] + name = "uptime_format" + uptime = "telegraf" + + [[agents.measurements.fields]] + name = "load1" + f64_range = [0.0, 8.0] + + [[agents.measurements.fields]] + name = "load5" + f64_range = [0.0, 8.0] + + [[agents.measurements.fields]] + name = "load15" + f64_range = [0.0, 8.0] + + +[[agents.measurements]] +name = "mem" + + [[agents.measurements.fields]] + name = "active" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "available" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "buffered" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "cached" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "inactive" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "slab" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "used" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "available_percent" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "used_percent" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "wired" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "commit_limit" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "committed_as" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "dirty" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "high_free" + i64_range = [0, 10000000] + + 
[[agents.measurements.fields]] + name = "high_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "huge_page_size" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "huge_pages_free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "huge_pages_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "low_free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "low_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "mapped" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "page_tables" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "shared" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "swap_cached" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "swap_free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "swap_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "vmalloc_chunk" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "vmalloc_total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "vmalloc_used" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "write_back" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "write_back_tmp" + i64_range = [0, 10000000] + +[[agents.measurements]] +name = "disk" + + [[agents.measurements.fields]] + name = "free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "used" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "used_percent" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "inodes_free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "inodes_total" + i64_range = [0, 10000000] + 
+ [[agents.measurements.fields]] + name = "inodes_used" + i64_range = [0, 10000000] + +[[agents.measurements]] +name = "swap" + + [[agents.measurements.fields]] + name = "free" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "total" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "used" + i64_range = [0, 1000000] # Note this is an order of magnitude less deliberately to match + # https://github.com/influxdata/idpe/blob/ffbceb04dd4b3aa0828d039135977a4f36f7b822/performance/capacity/write/swap.go#L17 + # not sure if that value was intentional, perhaps it is to ensure used < total? + + [[agents.measurements.fields]] + name = "used_percent" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "in" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "out" + i64_range = [0, 10000000] + +[[agents.measurements]] +name = "cpu" +tag_pairs = [{key = "cpu", template = "cpu-total"}] + + [[agents.measurements.fields]] + name = "usage_user" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_nice" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_system" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_idle" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_irq" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_softirq" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_steal" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_guest" + f64_range = [0.0, 100.0] + + [[agents.measurements.fields]] + name = "usage_guest_nice" + f64_range = [0.0, 100.0] + +[[agents.measurements]] +name = "processes" + + [[agents.measurements.fields]] + name = "blocked" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "running" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "sleeping" + 
i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "stopped" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "total" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "zombie" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "dead" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "wait" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "idle" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "paging" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "total_threads" + i64_range = [0, 255] + + [[agents.measurements.fields]] + name = "unknown" + i64_range = [0, 255] + +[[agents.measurements]] +name = "net" + + [[agents.measurements.fields]] + name = "bytes_recv" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "bytes_sent" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "packets_sent" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "packets_recv" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "err_in" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "err_out" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "drop_in" + i64_range = [0, 10000000] + + [[agents.measurements.fields]] + name = "drop_out" + i64_range = [0, 10000000] + +[[agents.measurements]] +name = "diskio" + + [[agents.measurements.fields]] + name = "reads" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "writes" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "read_bytes" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "write_bytes" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "read_time" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "write_time" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + 
name = "io_time" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "weighted_io_time" + i64_range = [0, 1000000] + + [[agents.measurements.fields]] + name = "iops_in_progress" + i64_range = [0, 1000000] diff --git a/iox_data_generator/schemas/eu_central.toml b/iox_data_generator/schemas/eu_central.toml new file mode 100644 index 0000000..4d8d966 --- /dev/null +++ b/iox_data_generator/schemas/eu_central.toml @@ -0,0 +1,107 @@ +# generates load with 20k measurements getting a little bit of data, 20 measurements getting 300x the amount of data +# and 3 measurements that are very wide with 600 fields. Adjust the count or sampling interval of the three different +# agents to adjust how much load each type generates. But note that the first_agent sends far more lines per request +# which is how those measurements see so much more data. +name = "eu_central_sim" + +[[values]] +name = "some_tag" +cardinality = 10 +template = "id_{{id}}_{{random 15}}" +has_one = ["extra_static"] + +[[values]] +name = "child_tag" +cardinality = 10 +belongs_to = "some_tag" +has_one = ["rotation"] +template = "id_{{id}}_{{random 10}}" + +[[values]] +name = "rotation" +cardinality = 3 +template = "id_{{id}}_{{guid}}" + +[[values]] +name = "extra_static" +cardinality = 1 +template = "whatever-constant-value" + +[[tag_sets]] +name = "first_set" +for_each = [ + "some_tag", + "some_tag.extra_static", + "some_tag.child_tag", + "child_tag.rotation", +] + +[[tag_sets]] +name = "lower_cardinality_set" +for_each = [ + "some_tag", +] + +[[agents]] +name = "first_agent" +tag_pairs = [ + {key = "agent_id", template = "{{agent.id}}"} +] + +[[agents.measurements]] +name = "first_agent_measurement_{{measurement.id}}" +count = 20 +tag_set = "first_set" +tag_pairs = [ + {key = "measurement_id", template = "{{measurement.id}}"} +] + +[[agents.measurements.fields]] +name = "intfield" +i64_range = [1, 100000] + +[[agents.measurements.fields]] +name = "floatfield" +f64_range = [0.0, 1.0] + 
+[[agents]] +name = "second_agent" +tag_pairs = [ + {key = "agent_id", template = "second_agent_{{agent.id}}"} +] + +[[agents.measurements]] +name = "second_agent_measurement_{{measurement.id}}" +count = 20000 +tag_pairs = [ + {key = "measurement_id", template = "{{measurement.id}}"} +] + +[[agents.measurements.fields]] +name = "intfield" +i64_range = [1,1000] + +[[agents]] +name = "third_agent" +tag_pairs = [ + {key = "agent_id", template = "third_agent_{{agent.id}}"} +] + +[[agents.measurements]] +name = "third_agent_measurement_{{measurement.id}}" +count = 3 +tag_pairs = [ + {key = "measurement_id", template = "{{measurement.id}}"} +] + +[[agents.measurements.fields]] +name = "intfield_{{field.id}}" +count = 600 +i64_range = [1,1000] + +[[database_writers]] +agents = [ + {name = "first_agent", sampling_interval = "1s", count = 5}, + {name = "second_agent", sampling_interval = "1s", count = 40}, + {name = "third_agent", sampling_interval = "1s", count = 5}, +] diff --git a/iox_data_generator/schemas/full_example.toml b/iox_data_generator/schemas/full_example.toml new file mode 100644 index 0000000..34cc987 --- /dev/null +++ b/iox_data_generator/schemas/full_example.toml @@ -0,0 +1,188 @@ +# One run of the data generator output to --print will generate lines like this: + +# m1,agent_id=1,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_1,t2=t2_1_t1_1,t3=t3_1 intfield=48.31541353358504,intfield=63.16007209180341 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_1,t2=t2_1_t1_1,t3=t3_1 intfield=88.35678081075594,intfield=92.55272385943789 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_1,t2=t2_2_t1_1,t3=t3_1 intfield=71.34233494102085,intfield=19.35816384444733 1635968173847440000 +# 
m1,agent_id=1,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_1,t2=t2_2_t1_1,t3=t3_1 intfield=76.63378118605834,intfield=16.298451067775588 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_2,t2=t2_3_t1_2,t3=t3_2 intfield=96.71554665990536,intfield=93.44948263155631 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_2,t2=t2_3_t1_2,t3=t3_2 intfield=78.16527647371738,intfield=2.302033401489534 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_2,t2=t2_4_t1_2,t3=t3_2 intfield=90.37434758868368,intfield=7.552315135635346 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_2,t2=t2_4_t1_2,t3=t3_2 intfield=25.173607422073285,intfield=99.10021825896477 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_3,t2=t2_5_t1_3,t3=t3_1 intfield=31.724290085601936,intfield=71.04269945188204 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_3,t2=t2_5_t1_3,t3=t3_1 intfield=98.38837237131071,intfield=95.35495119280799 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_3,t2=t2_6_t1_3,t3=t3_1 intfield=15.860338450579835,intfield=20.932831216902017 1635968173847440000 +# m1,agent_id=1,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_3,t2=t2_6_t1_3,t3=t3_1 intfield=73.52354656855404,intfield=21.906048846128144 1635968173847440000 +# 
m2,agent_id=1,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03 i64field=1678687i,strfield="7bAK",uptime=0i,uptime_format="0 days, 00:00" 1635968173847440000 +# m2,agent_id=1,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03 i64field=7287348i,strfield="r2Xj",uptime=0i,uptime_format="0 days, 00:00" 1635968173847440000 +# m1,agent_id=2,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_1,t2=t2_1_t1_1,t3=t3_1 intfield=34.21564966893025,intfield=28.404777885873145 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_1,t2=t2_1_t1_1,t3=t3_1 intfield=89.53280753147736,intfield=88.35520078152399 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_1,t2=t2_2_t1_1,t3=t3_1 intfield=93.0798657117769,intfield=95.15086332651886 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_1,t2=t2_2_t1_1,t3=t3_1 intfield=16.383204148086563,intfield=69.36287104937198 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_2,t2=t2_3_t1_2,t3=t3_2 intfield=86.07310267461553,intfield=84.1837111118747 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_2,t2=t2_3_t1_2,t3=t3_2 intfield=66.97292091697567,intfield=13.792714677819795 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_2,t2=t2_4_t1_2,t3=t3_2 intfield=41.66956499741617,intfield=60.54778655915278 1635968173849823000 +# 
m1,agent_id=2,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_2,t2=t2_4_t1_2,t3=t3_2 intfield=50.85432735762039,intfield=51.71473345880968 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_3,t2=t2_5_t1_3,t3=t3_1 intfield=35.488387176278735,intfield=40.69930728826883 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_3,t2=t2_5_t1_3,t3=t3_1 intfield=52.224104265522485,intfield=17.630042482636732 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_1,t1=t1_3,t2=t2_6_t1_3,t3=t3_1 intfield=37.061044012796174,intfield=71.24055048796617 1635968173849823000 +# m1,agent_id=2,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03,m1tag=1-1,m1tag2=1-2,other=o_2,t1=t1_3,t2=t2_6_t1_3,t3=t3_1 intfield=31.513973186770073,intfield=61.978547758411295 1635968173849823000 +# m2,agent_id=2,foo_bar=foo_8386ce4f-958d-42d5-9826-5dffc7e35ff9_1_DsZup_2021-11-03 i64field=3258846i,strfield="j9PD",uptime=0i,uptime_format="0 days, 00:00" 1635968173849823000 +# m2,agent_id=2,foo_bar=foo_8ffe4113-4680-43bd-9512-663bc84164f5_2_l8wdy_2021-11-03 i64field=4426192i,strfield="RBhn",uptime=0i,uptime_format="0 days, 00:00" 1635968173849823000 +# new_agent_measurement-1,agent-name=another_example f1=f 1635968398968189000 +# new_agent_measurement-2,agent-name=another_example f1=f 1635968398968189000 + +name = "full_example" + +# Values are automatically generated before the agents are initialized. They generate tag key/value pairs +# with the name of the value as the tag key and the evaluated template as the value. These pairs +# can be shared across tagsets and used in the agents as pre-generated data. 
+[[values]]
+# The name appears as the tag key in generated tag pairs and is used later when specifying tag_sets.
+# It must not have a . in it, which is used to access children later.
+name = "foo_bar"
+# This number of tag pairs will be generated. If this is > 1, the id or a random character string should be
+# used in the template to ensure that the tag key/value pairs are unique.
+cardinality = 2
+# The template will be evaluated for each tag pair (so N times where n == cardinality)
+# the template can use a number of helpers, which are shown in this example.
+# guid - generate a guid
+# id - the id of the tag pair. ids start at 1
+# random - generate random character string of the passed in length
+# format-time - strftime formatted string for now
+template = "foo_{{guid}}_{{id}}_{{random 5}}_{{format-time \"%Y-%m-%d\"}}"
+
+[[values]]
+name = "t1"
+template = "t1_{{id}}"
+cardinality = 3
+# each t1 generated will reference one of t3 and one of foo_bar. As each t1 is generated
+# it will loop through the t3 and foo_bar collections. So the 3rd t1 that is generated will
+# reference the first t3 and foo_bar
+has_one = ["t3", "foo_bar"]
+
+[[values]]
+name = "t2"
+# note that in this template we can access the parent's id because of the belongs_to
+template = "t2_{{id}}_t1_{{t1.id}}"
+cardinality = 2
+belongs_to = "t1"
+
+[[values]]
+name = "t3"
+template = "t3_{{id}}"
+cardinality = 2
+
+[[values]]
+name = "other"
+template = "o_{{id}}"
+cardinality = 2
+
+# tag_sets can be used later in the measurement specification. Each measurement can use one
+# tag set (or none). For each sampling that is generated, each measurement will have lines
+# generated equal to the cardinality of the tagset.
+[[tag_sets]]
+name = "example"
+# for_each specifies how to iterate through the values to generate tagsets. If you want to
+# use values that belong_to others or are a has_one, specify their parent first. 
For values +# without relationships, you'll get a combined cardinality of each multiplied by the other. +# In this example we get cardinality of card(t1) * card(foo_bar) * card(other). The has_one +# members of t1 don't increase cardinality. +for_each = [ + "t1", + "t1.t3", + "t1.foo_bar", + "t1.t2", + "other", +] + +[[tag_sets]] +name = "foos" +# note here that we have a tag set of foo_bar tag pairs. Values can be used outside the +# context of where they may be referenced in belong_to or has_one +for_each = [ + "foo_bar" +] + +# Agent specs can be referenced later on by bucket writers, which specify how frequently +# data should be written and by how many different agents. +[[agents]] +name = "first_agent" +# if specifying tag_pairs at the agent level, every line that the agent generates will have these +# tag pairs added to it. Note that the template has the same helpers as those in value (except for id). +# In addition, it has an accessor for the agent id. +tag_pairs = [ + {key = "agent_id", template = "{{agent.id}}"} +] + +[[agents.measurements]] +name = "m1" +# each sampling will have all the tag sets from this collection in addition to the tags and tag_pairs specified +tag_set = "example" +# for each agent, this specific measurement will be decorated with these additional tags. All the previous +# template helpers are available including now `measurement.id` and `tag.id`. +# This example also shows how to automatically generate many tags using `count` and how to specify +# that the tag value template should be re-evaluated after N number of lines. This N is counted across +# samplings. 
+tag_pairs = [ + {key = "m1tag", template = "{{measurement.id}}-{{id}}", count = 2, regenerate_after_lines = 5} +] + +# field values are generated on every line as they're written out +[[agents.measurements.fields]] +name = "intfield" +# Count is optional, we can use it to automatically create many fields +count = 2 +f64_range = [0.0, 100.00] + +[[agents.measurements]] +name = "m2" +tag_set = "foos" + +[[agents.measurements.fields]] +name = "i64field" +i64_range = [1, 8147240] + +[[agents.measurements.fields]] +name = "strfield" +template = "{{random 4}}" + +# this generates an int value representing how long the agent has been running +[[agents.measurements.fields]] +name = "uptime" +uptime = "i64" + +# generates uptime as a string value +[[agents.measurements.fields]] +name = "uptime_format" +uptime = "telegraf" + +[[agents]] +name = "another_example" +tag_pairs = [{key = "agent_name", template = "agent.name"}] + +[[agents.measurements]] +name = "new_agent_measurement-{{measurement.id}}" +# you can automatically generate many measurements with the same schema +count = 2 + +[[agents.measurements.fields]] +name = "f1" +bool = true + +# database_writers specify how to split up the list of supplied buckets to write to. If +# only a single one is specified via the CLI flags, then you'd want only a single bucket_writer +# with a percent of 1.0. +# +# These make it possible to split up a large list of buckets to write to and send different +# amounts of write load as well as different schemas through specifying different agents. +[[database_writers]] +# the first 20% of the databases specified in the --bucket_list file will have these agents writing to them +database_ratio = 0.2 +# for each of those databases, have 3 of the first_agent writing every 10s, and 1 of the another_example writing every minute. 
+agents = [{name = "first_agent", count = 3, sampling_interval = "10s"}, {name = "another_example", sampling_interval = "1m"}] + +[[database_writers]] +# the remaining 80% of the databases specified will write using these agents +database_ratio = 0.8 +# we'll only have a single agent of another_example for each database +agents = [{name = "another_example", sampling_interval = "1s"}] diff --git a/iox_data_generator/schemas/many_dbs.toml b/iox_data_generator/schemas/many_dbs.toml new file mode 100644 index 0000000..8f160e7 --- /dev/null +++ b/iox_data_generator/schemas/many_dbs.toml @@ -0,0 +1,80 @@ +# This schema is meant to test out many databases writing data in like a bunch of free tier users. +# Start with a database_list of 10k to make things interesting. This will send on average of +# 208 requests/sec and 6.25 MB/sec across the 10k databases. The top 10 will have 60 requests/min +# and 1.8MB/min. +name = "many_dbs" + +[[values]] +name = "some_tag_10" +cardinality = 2 +template = "id_{{id}}_{{random 15}}" +has_one = ["extra_static"] + +[[values]] +name = "child_tag" +cardinality = 3 +belongs_to = "some_tag_10" +has_one = ["rotation"] +template = "id_{{id}}_{{random 10}}" + +[[values]] +name = "rotation" +cardinality = 4 +template = "id_{{id}}_{{guid}}" + +[[values]] +name = "extra_static" +cardinality = 1 +template = "whatever-constant-value" + +[[tag_sets]] +name = "first_set" +for_each = [ + "some_tag_10", + "some_tag_10.extra_static", + "some_tag_10.child_tag", + "child_tag.rotation", +] + +# each sampling from this agent generates 32,465 bytes of LP, first few lines look like: +# main_measurement_1,agent_id=1,child_tag=id_1_T6iJnnBTE3,extra_static=whatever-constant-value,measurement_tag=1,rotation=id_1_de4ddb8c-31a6-440f-a273-7132bdd43bd7,some_tag_10=id_1_rWtIkI26LTlfu0J intfield=71334i,floatfield=0.7934452557768101 1639151629935287000 +# 
main_measurement_1,agent_id=1,child_tag=id_2_VsiUF2xVuz,extra_static=whatever-constant-value,measurement_tag=1,rotation=id_2_890145b3-8157-4d6f-ac02-1fe37584190f,some_tag_10=id_1_rWtIkI26LTlfu0J intfield=64582i,floatfield=0.0957134480635704 1639151629935287000 +# main_measurement_1,agent_id=1,child_tag=id_3_XNL51f1NdT,extra_static=whatever-constant-value,measurement_tag=1,rotation=id_3_8bcf7547-06e9-4033-9ffb-e00ac4e6c5a9,some_tag_10=id_1_rWtIkI26LTlfu0J intfield=26179i,floatfield=0.09993902612184669 1639151629935287000 +# main_measurement_1,agent_id=1,child_tag=id_4_mqCyprcTDQ,extra_static=whatever-constant-value,measurement_tag=1,rotation=id_4_f465d43e-f1ab-4250-99ac-67af7c1d4c72,some_tag_10=id_2_X4eWjH9ImjTeta2 intfield=16511i,floatfield=0.033060266070114475 1639151629935287000 +[[agents]] +name = "first_agent" +tag_pairs = [ + {key = "agent_id", template = "{{agent.id}}"} +] + +[[agents.measurements]] +name = "main_measurement_{{measurement.id}}" +count = 20 +tag_set = "first_set" +tag_pairs = [ + {key = "measurement_tag", template = "{{measurement.id}}"} +] + +[[agents.measurements.fields]] +name = "intfield" +i64_range = [1, 100000] + +[[agents.measurements.fields]] +name = "floatfield" +f64_range = [0.0, 1.0] + +[[database_writers]] +database_ratio = 0.001 +agents = [{name = "first_agent", sampling_interval = "1s"}] + +[[database_writers]] +database_ratio = 0.01 +agents = [{name = "first_agent", sampling_interval = "10s"}] + +[[database_writers]] +database_ratio = 0.1 +agents = [{name = "first_agent", sampling_interval = "30s"}] + +[[database_writers]] +database_ratio = 1.0 +agents = [{name = "first_agent", sampling_interval = "60s"}] \ No newline at end of file diff --git a/iox_data_generator/schemas/many_measurements.toml b/iox_data_generator/schemas/many_measurements.toml new file mode 100644 index 0000000..66bd27d --- /dev/null +++ b/iox_data_generator/schemas/many_measurements.toml @@ -0,0 +1,61 @@ +# This schema tests what load looks like with many 
measurements (2,000). If pointed at a single database +# with the configured 20 agents at 10s sampling, it will send an average of 2 requests/second (representing +# 16k rows) with 4.1MB/second of LP being written. Each agent writes 8k lines per request. +name = "many_measurements" + +[[values]] +name = "some_tag" +cardinality = 2 +template = "id_{{id}}_{{random 15}}" +has_one = ["extra_static"] + +[[values]] +name = "child_tag" +cardinality = 2 +belongs_to = "some_tag" +has_one = ["rotation"] +template = "id_{{id}}_{{random 10}}" + +[[values]] +name = "rotation" +cardinality = 3 +template = "id_{{id}}_{{guid}}" + +[[values]] +name = "extra_static" +cardinality = 1 +template = "whatever-constant-value" + +[[tag_sets]] +name = "first_set" +for_each = [ + "some_tag", + "some_tag.extra_static", + "some_tag.child_tag", + "child_tag.rotation", +] + +[[agents]] +name = "first_agent" +tag_pairs = [ + {key = "agent_id", template = "{{agent.id}}"} +] + +[[agents.measurements]] +name = "main_measurement_{{measurement.id}}" +count = 2000 +tag_set = "first_set" +tag_pairs = [ + {key = "measurement_id", template = "{{measurement.id}}"} +] + +[[agents.measurements.fields]] +name = "intfield" +i64_range = [1, 100000] + +[[agents.measurements.fields]] +name = "floatfield" +f64_range = [0.0, 1.0] + +[[database_writers]] +agents = [{name = "first_agent", sampling_interval = "10s", count = 20}] diff --git a/iox_data_generator/schemas/storage_cardinality_example.toml b/iox_data_generator/schemas/storage_cardinality_example.toml new file mode 100644 index 0000000..15b9707 --- /dev/null +++ b/iox_data_generator/schemas/storage_cardinality_example.toml @@ -0,0 +1,81 @@ +name = "storage_cardinality_example" + +# Values are automatically generated before the agents are initialized. They generate tag key/value pairs +# with the name of the value as the tag key and the evaluated template as the value. 
These pairs +# are Arc wrapped so they can be shared across tagsets and used in the agents as pre-generated data. +[[values]] +# the name must not have a . in it, which is used to access children later. Otherwise it's open. +name = "role" +# the template can use a number of helpers to get an id, a random string and the name, see below for examples +template = "storage" +# this number of tag pairs will be generated. If this is > 1, the id or a random character string should be +# used in the template to ensure that the tag key/value pairs are unique. +cardinality = 1 + +[[values]] +name = "url" +template = "http://127.0.0.1:6060/metrics/usage" +cardinality = 1 + +[[values]] +name = "org_id" +# Fill in the value with the cardinality counter and 15 random alphanumeric characters +template = "{{id}}_{{random 15}}" +cardinality = 100 +has_one = ["env"] + +[[values]] +name = "env" +template = "whatever-environment-{{id}}" +cardinality = 2 + +[[values]] +name = "bucket_id" +# a bucket belongs to an org. With this, you would be able to access the org.id or org.value in the template +belongs_to = "org_id" +# each bucket will have a unique id, which is used here to guarantee uniqueness even across orgs. We also +# have a random 15 character alphanumeric sequence to pad out the value length. +template = "{{id}}_{{random 15}}" +# For each org, 3 buckets will be generated +cardinality = 3 + +[[values]] +name = "partition_id" +template = "{{id}}" +cardinality = 10 + +# makes a tagset so every bucket appears in every partition. The other tags are descriptive and don't +# increase the cardinality beyond count(bucket) * count(partition). Later this example will use the +# agent and measurement generation to take this base tagset and increase cardinality on a per-agent basis. 
+[[tag_sets]] +name = "bucket_set" +for_each = [ + "role", + "url", + "org_id", + "org_id.env", + "org_id.bucket_id", + "partition_id", +] + +[[database_writers]] +database_ratio = 1.0 +agents = [{name = "sender", sampling_interval = "10s"}] + +[[agents]] +name = "sender" + +[[agents.measurements]] +name = "storage_usage_bucket_cardinality" +# each sampling will have all the tag sets from this collection in addition to the tags and tag_pairs specified +tag_set = "bucket_set" +# for each agent, this specific measurement will be decorated with these additional tags. +tag_pairs = [ + {key = "node_id", template = "{{agent.id}}"}, + {key = "hostname", template = "{{agent.id}}"}, + {key = "host", template = "storage-{{agent.id}}"}, +] + +[[agents.measurements.fields]] +name = "gauge" +i64_range = [1, 8147240] diff --git a/iox_data_generator/schemas/tracing-spec.toml b/iox_data_generator/schemas/tracing-spec.toml new file mode 100644 index 0000000..1df620d --- /dev/null +++ b/iox_data_generator/schemas/tracing-spec.toml @@ -0,0 +1,35 @@ +name = "tracing_schema" + +[[values]] +name = "host" +template = "server-{{id}}" +cardinality = 3000 +has_one = ["service"] + +[[values]] +name = "service" +template = "service-{{id}}" +cardinality = 10 + +[[tag_sets]] +name = "host_services" +for_each = ["host", "host.service"] + +[[agents]] +name = "tracing_agent" + +[[agents.measurements]] +name = "traces" +tag_set = "host_services" +tag_pairs = [ + {key = "trace_id", template = "{{guid}}", regenerate_after_lines = 10}, + {key = "span_id", template = "{{guid}}", regenerate_after_lines = 1}, +] + +[[agents.measurements.fields]] +name = "timing" +f64_range = [0.0, 500.0] + +[[database_writers]] +database_ratio = 1.0 +agents = [{name = "tracing_agent", sampling_interval = "1s"}] diff --git a/iox_data_generator/src/agent.rs b/iox_data_generator/src/agent.rs new file mode 100644 index 0000000..aeed284 --- /dev/null +++ b/iox_data_generator/src/agent.rs @@ -0,0 +1,692 @@ +//! 
Agents responsible for generating points + +use crate::{ + measurement::{MeasurementGenerator, MeasurementLineIterator}, + now_ns, specification, + tag_pair::TagPair, + write::PointsWriter, +}; + +use crate::tag_set::GeneratedTagSets; +use serde_json::json; +use snafu::{ResultExt, Snafu}; +use std::sync::{ + atomic::{AtomicU64, Ordering}, + Arc, +}; +use std::time::{Duration, Instant}; +use tracing::debug; + +/// Agent-specific Results +pub type Result = std::result::Result; + +/// Errors that may happen while creating points +#[derive(Snafu, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("{}", source))] + CouldNotGeneratePoint { + /// Underlying `measurement` module error that caused this problem + source: crate::measurement::Error, + }, + + #[snafu(display("Could not create measurement generators, caused by:\n{}", source))] + CouldNotCreateMeasurementGenerators { + /// Underlying `measurement` module error that caused this problem + source: crate::measurement::Error, + }, + + #[snafu(display("Could not write points, caused by:\n{}", source))] + CouldNotWritePoints { + /// Underlying `write` module error that caused this problem + source: crate::write::Error, + }, + + #[snafu(display("Error creating agent tag pairs: {}", source))] + CouldNotCreateAgentTagPairs { source: crate::tag_pair::Error }, +} + +/// Each `AgentSpec` informs the instantiation of an `Agent`, which coordinates +/// the generation of the measurements in their specification. +#[derive(Debug)] +pub struct Agent { + /// identifier for the agent. This can be used in generated tags and fields + pub id: usize, + /// name for the agent. 
This can be used in generated tags and fields + pub name: String, + measurement_generators: Vec, + sampling_interval: Option, + /// nanoseconds since the epoch, used as the timestamp for the next + /// generated point + current_datetime: i64, + /// nanoseconds since the epoch, when current_datetime exceeds this, stop + /// generating points + end_datetime: i64, + /// whether to continue generating points after reaching the current time + continue_on: bool, + /// whether this agent is done generating points or not + finished: bool, + /// Optional interval at which to re-run the agent if generating data in + /// "continue" mode + interval: Option, +} + +/// Basic stats for agents generating requests +#[derive(Debug, Default, Copy, Clone)] +pub struct AgentGenerateStats { + /// number of rows the agent has written + pub row_count: usize, + /// number of requests the agent has made + pub request_count: usize, + /// number of errors + pub error_count: usize, +} + +impl AgentGenerateStats { + /// Display output for agent writing stats + pub fn display_stats(&self, elapsed_time: Duration) -> String { + if elapsed_time.as_secs() == 0 { + format!( + "made {} requests with {} rows in {:?} with {} errors for a {:.2} error rate", + self.request_count, + self.row_count, + elapsed_time, + self.error_count, + self.error_rate() + ) + } else { + let req_secs = elapsed_time.as_secs(); + let rows_per_sec = self.row_count as u64 / req_secs; + let reqs_per_sec = self.request_count as u64 / req_secs; + format!("made {} requests at {}/sec with {} rows at {}/sec in {:?} with {} errors for a {:.2} error rate", + self.request_count, reqs_per_sec, self.row_count, rows_per_sec, elapsed_time, self.error_count, self.error_rate()) + } + } + + fn error_rate(&self) -> f64 { + if self.error_count == 0 { + return 0.0; + } + self.error_count as f64 / self.request_count as f64 * 100.0 + } +} + +impl Agent { + /// Create agents that will generate data points according to these + /// specs. 
+ #[allow(clippy::too_many_arguments)] + pub fn from_spec( + agent_spec: &specification::AgentSpec, + count: usize, + sampling_interval: Duration, + start_datetime: Option, // in nanoseconds since the epoch, defaults to now + end_datetime: Option, // also in nanoseconds since the epoch, defaults to now + execution_start_time: i64, + continue_on: bool, // If true, run in "continue" mode after historical data is generated + generated_tag_sets: &GeneratedTagSets, + ) -> Result> { + let agents: Vec<_> = (1..count + 1) + .map(|agent_id| { + let data = json!({"agent": {"id": agent_id, "name": agent_spec.name}}); + + let agent_tag_pairs = TagPair::pairs_from_specs(&agent_spec.tag_pairs, data) + .context(CouldNotCreateAgentTagPairsSnafu)?; + + let measurement_generators = agent_spec + .measurements + .iter() + .map(|spec| { + MeasurementGenerator::from_spec( + agent_id, + spec, + execution_start_time, + generated_tag_sets, + &agent_tag_pairs, + ) + .context(CouldNotCreateMeasurementGeneratorsSnafu) + }) + .collect::>>()?; + let measurement_generators = measurement_generators.into_iter().flatten().collect(); + + let current_datetime = start_datetime.unwrap_or_else(now_ns); + let end_datetime = end_datetime.unwrap_or_else(now_ns); + + Ok(Self { + id: agent_id, + name: agent_spec.name.to_string(), + measurement_generators, + sampling_interval: Some(sampling_interval), + current_datetime, + end_datetime, + continue_on, + finished: false, + interval: None, + }) + }) + .collect::>>()?; + + Ok(agents) + } + + /// Generate and write points in batches until `generate` doesn't return any + /// points. Points will be written to the writer in batches where `generate` is + /// called `batch_size` times before writing. Meant to be called in a `tokio::task`. 
+ pub async fn generate_all( + &mut self, + points_writer: Arc, + batch_size: usize, + counter: Arc, + request_counter: Arc, + ) -> Result { + let mut points_this_batch = 1; + let start = Instant::now(); + let mut stats = AgentGenerateStats::default(); + + while points_this_batch != 0 { + let batch_start = Instant::now(); + points_this_batch = 0; + + let mut streams = Vec::with_capacity(batch_size); + for _ in 0..batch_size { + if self.finished { + break; + } else { + let mut s = self.generate().await?; + streams.append(&mut s); + } + } + + for s in &streams { + points_this_batch += s.line_count(); + } + + if points_this_batch == 0 && self.finished { + break; + } + + stats.request_count += 1; + match points_writer + .write_points(streams.into_iter().flatten()) + .await + .context(CouldNotWritePointsSnafu) + { + Ok(_) => { + stats.row_count += points_this_batch; + + if stats.request_count % 10 == 0 { + println!( + "Agent {} wrote {} in {:?}", + self.id, + points_this_batch, + batch_start.elapsed() + ); + } + + // output something on the aggregate stats every 100 requests across all agents + let total_rows = counter.fetch_add(points_this_batch as u64, Ordering::SeqCst); + let total_requests = request_counter.fetch_add(1, Ordering::SeqCst); + + if total_requests % 100 == 0 { + let secs = start.elapsed().as_secs(); + if secs != 0 { + println!( + "{} rows written in {} requests for {} rows/sec and {} reqs/sec", + total_rows, + total_requests, + total_rows / secs, + total_requests / secs, + ) + } + } + } + Err(e) => { + eprintln!("Error writing points: {e}"); + stats.error_count += 1; + } + } + } + + Ok(stats) + } + + /// Generate data points from the configuration in this agent. + pub async fn generate(&mut self) -> Result> { + debug!( + "[agent {}] finished? 
{} current: {}, end: {}", + self.id, self.finished, self.current_datetime, self.end_datetime + ); + + if !self.finished { + let mut measurement_streams = Vec::with_capacity(self.measurement_generators.len()); + + // Save the current_datetime to use in the set of points that we're generating + // because we might increment current_datetime to see if we're done + // or not. + let point_timestamp = self.current_datetime; + + if let Some(i) = &mut self.interval { + i.tick().await; + self.current_datetime = now_ns(); + } else if let Some(sampling_interval) = self.sampling_interval { + self.current_datetime += sampling_interval.as_nanos() as i64; + + if self.current_datetime > self.end_datetime { + if self.continue_on { + let mut i = tokio::time::interval(sampling_interval); + i.tick().await; // first tick completes immediately + self.current_datetime = now_ns(); + self.interval = Some(i); + } else { + self.finished = true; + } + } + } else { + self.finished = true; + } + + for mgs in &mut self.measurement_generators { + measurement_streams.push( + mgs.generate(point_timestamp) + .context(CouldNotGeneratePointSnafu)?, + ); + } + + Ok(measurement_streams) + } else { + Ok(Vec::new()) + } + } + + /// Sets the current date and time for the agent and resets its finished state to false. Enables + /// calling generate again during testing and benchmarking. + pub fn reset_current_date_time(&mut self, current_datetime: i64) { + self.finished = false; + self.current_datetime = current_datetime; + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::measurement::LineToGenerate; + use crate::{now_ns, specification::*}; + use influxdb2_client::models::WriteDataPoint; + + type Error = Box; + type Result = std::result::Result; + + impl Agent { + /// Instantiate an agent only with the parameters we're interested in + /// testing, keeping everything else constant across different + /// tests. 
+ fn test_instance( + sampling_interval: Option, + continue_on: bool, + current_datetime: i64, + end_datetime: i64, + ) -> Self { + let measurement_spec = MeasurementSpec { + name: "measurement-{{agent.id}}-{{measurement.id}}".into(), + count: Some(2), + fields: vec![FieldSpec { + name: "field-{{agent.id}}-{{measurement.id}}-{{field.id}}".into(), + field_value_spec: FieldValueSpec::I64 { + range: 0..60, + increment: false, + reset_after: None, + }, + count: Some(2), + }], + tag_pairs: vec![], + tag_set: None, + }; + + let generated_tag_sets = GeneratedTagSets::default(); + + let measurement_generators = MeasurementGenerator::from_spec( + 1, + &measurement_spec, + current_datetime, + &generated_tag_sets, + &[], + ) + .unwrap(); + + Self { + id: 0, + name: "foo".to_string(), + finished: false, + interval: None, + + sampling_interval, + current_datetime, + end_datetime, + continue_on, + measurement_generators, + } + } + } + + fn timestamps(points: &[LineToGenerate]) -> Result> { + points + .iter() + .map(|point| { + let mut v = Vec::new(); + point.write_data_point_to(&mut v)?; + let line = String::from_utf8(v)?; + + Ok(line.split(' ').last().unwrap().trim().parse()?) + }) + .collect() + } + + #[rustfmt::skip] + // # Summary: No Sampling Interval + // + // If there isn't a sampling interval, we don't know how often to run, so we can neither + // generate historical data nor can we continue into the future. The only thing we'll do is + // generate once then stop. 
+ // + // | sampling_interval | continue | cmp(current_time, end_time) | expected outcome | + // |-------------------+----------+-----------------------------+------------------| + // | None | false | Less | gen 1x, stop | + // | None | false | Equal | gen 1x, stop | + // | None | false | Greater | gen 1x, stop | + // | None | true | Less | gen 1x, stop | + // | None | true | Equal | gen 1x, stop | + // | None | true | Greater | gen 1x, stop | + + mod without_sampling_interval { + use super::*; + + mod without_continue { + use super::*; + + #[tokio::test] + async fn current_time_less_than_end_time() -> Result<()> { + let mut agent = Agent::test_instance(None, false, 0, 10); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert!(points.is_empty(), "expected no points, got {points:?}"); + + Ok(()) + } + + #[tokio::test] + async fn current_time_equal_end_time() -> Result<()> { + let mut agent = Agent::test_instance(None, false, 10, 10); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert!(points.is_empty(), "expected no points, got {points:?}"); + + Ok(()) + } + + #[tokio::test] + async fn current_time_greater_than_end_time() -> Result<()> { + let mut agent = Agent::test_instance(None, false, 11, 10); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert!(points.is_empty(), "expected no points, got {points:?}"); + + Ok(()) + } + } + + mod with_continue { + use super::*; + + #[tokio::test] + async fn current_time_less_than_end_time() -> Result<()> { + let mut agent = Agent::test_instance(None, true, 0, 
10); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert!(points.is_empty(), "expected no points, got {points:?}"); + + Ok(()) + } + + #[tokio::test] + async fn current_time_equal_end_time() -> Result<()> { + let mut agent = Agent::test_instance(None, true, 10, 10); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert!(points.is_empty(), "expected no points, got {points:?}"); + + Ok(()) + } + + #[tokio::test] + async fn current_time_greater_than_end_time() -> Result<()> { + let mut agent = Agent::test_instance(None, true, 11, 10); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert!(points.is_empty(), "expected no points, got {points:?}"); + + Ok(()) + } + } + } + + mod with_sampling_interval { + use super::*; + + // The tests take about 5 ms to run on my computer, so set the sampling interval + // to 10 ms to be able to test that the delay is happening when + // `continue` is true without making the tests too artificially slow. + const TEST_SAMPLING_INTERVAL: Duration = Duration::from_millis(10); + + #[rustfmt::skip] + // # Summary: Not continuing + // + // If there is a sampling interval but we're not continuing, we should generate points at + // least once but if the current time is greater than the ending time (which might be set + // to `now`), we've generated everything we need to and should stop. 
+ // + // | sampling_interval | continue | cmp(current_time, end_time) | expected outcome | + // |-------------------+----------+-----------------------------+------------------| + // | Some(_) | false | Less | gen & increment | + // | Some(_) | false | Equal | gen 1x, stop | + // | Some(_) | false | Greater | gen 1x, stop | + + mod without_continue { + use super::*; + + #[tokio::test] + async fn current_time_less_than_end_time() -> Result<()> { + let current = 0; + let end = TEST_SAMPLING_INTERVAL.as_nanos() as i64; + + let mut agent = + Agent::test_instance(Some(TEST_SAMPLING_INTERVAL), false, current, end); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert!(points.is_empty(), "expected no points, got {points:?}"); + + Ok(()) + } + + #[tokio::test] + async fn current_time_equal_end_time() -> Result<()> { + let current = TEST_SAMPLING_INTERVAL.as_nanos() as i64; + let end = current; + + let mut agent = + Agent::test_instance(Some(TEST_SAMPLING_INTERVAL), false, current, end); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert!(points.is_empty(), "expected no points, got {points:?}"); + + Ok(()) + } + + #[tokio::test] + async fn current_time_greater_than_end_time() -> Result<()> { + let current = 2 * TEST_SAMPLING_INTERVAL.as_nanos() as i64; + let end = TEST_SAMPLING_INTERVAL.as_nanos() as i64; + + let mut agent = + Agent::test_instance(Some(TEST_SAMPLING_INTERVAL), false, current, end); + + let points = agent.generate().await?.into_iter().flatten(); + assert_eq!(points.count(), 2); + + let points = agent.generate().await?.into_iter().flatten(); + let 
points: Vec<_> = points.collect(); + assert!(points.is_empty(), "expected no points, got {points:?}"); + + Ok(()) + } + } + + #[rustfmt::skip] + // # Summary: After generating historical data, continue sampling in "real time" + // + // If there is a sampling interval and we are continuing, generate points as fast as + // possible (but with timestamps separated by sampling_interval amounts) until we catch up + // to `now`. Then add pauses of the sampling_interval's duration, generating points with + // their timestamps set to the current time to simulate "real" point generation. + // + // | sampling_interval | continue | cmp(current_time, end_time) | expected outcome | + // |-------------------+----------+-----------------------------+------------------| + // | Some(_) | true | Less | gen, no delay | + // | Some(_) | true | Equal | gen, delay | + // | Some(_) | true | Greater | gen, delay | + + mod with_continue { + use super::*; + + #[tokio::test] + async fn current_time_less_than_end_time() -> Result<()> { + let end = now_ns(); + let current = end - TEST_SAMPLING_INTERVAL.as_nanos() as i64; + + let mut agent = + Agent::test_instance(Some(TEST_SAMPLING_INTERVAL), true, current, end); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert_eq!(points.len(), 2); + + let times = timestamps(&points).unwrap(); + assert_eq!(vec![current, current], times); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert_eq!(points.len(), 2); + + let times = timestamps(&points).unwrap(); + assert_eq!(vec![end, end], times); + + Ok(()) + } + + #[tokio::test] + async fn current_time_equal_end_time() -> Result<()> { + let end = now_ns(); + let current = end; + + let mut agent = + Agent::test_instance(Some(TEST_SAMPLING_INTERVAL), true, current, end); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + 
assert_eq!(points.len(), 2); + + let times = timestamps(&points).unwrap(); + assert_eq!(vec![end, end], times); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert_eq!(points.len(), 2); + + let real_now = now_ns(); + + let times = timestamps(&points).unwrap(); + for time in times { + assert!( + time <= real_now, + "expected timestamp {} to be generated before now ({}); \ + was {} nanoseconds greater", + time, + real_now, + time - real_now + ); + } + + Ok(()) + } + + #[tokio::test] + async fn current_time_greater_than_end_time() -> Result<()> { + let end = now_ns(); + let current = end + TEST_SAMPLING_INTERVAL.as_nanos() as i64; + + let mut agent = + Agent::test_instance(Some(TEST_SAMPLING_INTERVAL), true, current, end); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert_eq!(points.len(), 2); + + let times = timestamps(&points).unwrap(); + assert_eq!(vec![current, current], times); + + let points = agent.generate().await?.into_iter().flatten(); + let points: Vec<_> = points.collect(); + assert_eq!(points.len(), 2); + + let real_now = now_ns(); + + let times = timestamps(&points).unwrap(); + for time in times { + assert!( + time <= real_now, + "expected timestamp {} to be generated before now ({}); \ + was {} nanoseconds greater", + time, + real_now, + time - real_now + ); + } + + Ok(()) + } + } + } +} diff --git a/iox_data_generator/src/bin/iox_data_generator.rs b/iox_data_generator/src/bin/iox_data_generator.rs new file mode 100644 index 0000000..3355b28 --- /dev/null +++ b/iox_data_generator/src/bin/iox_data_generator.rs @@ -0,0 +1,268 @@ +//! Entry point for generator CLI. 
+#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr +)] + +use chrono::prelude::*; +use iox_data_generator::{specification::DataSpec, write::PointsWriterBuilder}; +use std::{ + fs::File, + io::{self, BufRead}, +}; +use tracing::info; + +#[derive(clap::Parser)] +#[clap( + name = "iox_data_generator", + about = "IOx data point generator", + long_about = r#"IOx data point generator + +Examples: + # Generate data points using the specification in `spec.toml` and save in the `lp` directory + iox_data_generator -s spec.toml -o lp + + # Generate data points and write to the server running at localhost:8080 with the provided org, + # bucket and authorization token + iox_data_generator -s spec.toml -h localhost:8080 --org myorg --bucket mybucket --token mytoken + + # Generate data points for the 24 hours between midnight 2020-01-01 and 2020-01-02 + iox_data_generator -s spec.toml -o lp --start 2020-01-01 --end 2020-01-02 + + # Generate data points starting from an hour ago until now, generating the historical data as + # fast as possible. Then generate data according to the sampling interval until terminated. + iox_data_generator -s spec.toml -o lp --start "1 hr" --continue + +Logging: + Use the RUST_LOG environment variable to configure the desired logging level. 
+ For example: + + # Enable INFO level logging for all of iox_data_generator + RUST_LOG=iox_data_generator=info iox_data_generator -s spec.toml -o lp +"#, + author, + version, + disable_help_flag = true, + arg( + clap::Arg::new("help") + .long("help") + .help("Print help information") + .action(clap::ArgAction::Help) + .global(true) + ), +)] +struct Config { + /// Path to the specification TOML file describing the data generation + #[clap(long, short, action)] + specification: String, + + /// Print the generated line protocol from a single sample collection to the terminal + #[clap(long, action)] + print: bool, + + /// Runs the generation with agents writing to a sink. Useful for quick stress test to see how + /// much resources the generator will take + #[clap(long, action)] + noop: bool, + + /// The directory to write line protocol to + #[clap(long, short, action)] + output: Option<String>, + + /// The directory to write Parquet files to + #[clap(long, short, action)] + parquet: Option<String>, + + /// The host name part of the API endpoint to write to + #[clap(long, short, action)] + host: Option<String>, + + /// The organization name to write to + #[clap(long, action)] + org: Option<String>, + + /// The bucket name to write to + #[clap(long, action)] + bucket: Option<String>, + + /// File name with a list of databases. 1 per line in `org_bucket` format + #[clap(long, action)] + database_list: Option<String>, + + /// The API authorization token used for all requests + #[clap(long, action)] + token: Option<String>, + + /// The date and time at which to start the timestamps of the generated data. + /// + /// Can be an exact datetime like `2020-01-01T01:23:45-05:00` or a fuzzy + /// specification like `1 hour`. If not specified, defaults to now. + #[clap(long, action)] + start: Option<String>, + + /// The date and time at which to stop the timestamps of the generated data. + /// + /// Can be an exact datetime like `2020-01-01T01:23:45-05:00` or a fuzzy + /// specification like `1 hour`. If not specified, defaults to now. 
+ #[clap(long, action)] + end: Option<String>, + + /// Generate live data using the intervals from the spec after generating historical data. + /// + /// This option has no effect if you specify an end time. + #[clap(long = "continue", action)] + do_continue: bool, + + /// Generate this many samplings to batch into a single API call. Good for sending a bunch of + /// historical data in quickly if paired with a start time from long ago. + #[clap(long, action, default_value = "1")] + batch_size: usize, + + /// Generate jaeger debug header with given key during write + #[clap(long, action)] + jaeger_debug_header: Option<String>, +} + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + let config: Config = clap::Parser::parse(); + + if !config.print { + tracing_subscriber::fmt::init(); + } + + let execution_start_time = Local::now(); + let execution_start_time_nanos = execution_start_time + .timestamp_nanos_opt() + .expect("'now' is in nano range"); + + let start_datetime = datetime_nanoseconds(config.start.as_deref(), execution_start_time); + let end_datetime = datetime_nanoseconds(config.end.as_deref(), execution_start_time); + + let start_display = start_datetime.unwrap_or(execution_start_time_nanos); + let end_display = end_datetime.unwrap_or(execution_start_time_nanos); + + let continue_on = config.do_continue; + + info!( + "Starting at {}, ending at {} ({}){}", + start_display, + end_display, + (end_display - start_display) / 1_000_000_000, + if continue_on { " then continuing" } else { "" }, + ); + + let data_spec = DataSpec::from_file(&config.specification)?; + + let mut points_writer_builder = if let Some(line_protocol_filename) = config.output { + PointsWriterBuilder::new_file(line_protocol_filename)? + } else if let Some(parquet_directory) = config.parquet { + PointsWriterBuilder::new_parquet(parquet_directory)? 
+ } else if let Some(ref host) = config.host { + let token = config.token.expect("--token must be specified"); + + PointsWriterBuilder::new_api(host, token, config.jaeger_debug_header.as_deref()).await? + } else if config.print { + PointsWriterBuilder::new_std_out() + } else if config.noop { + PointsWriterBuilder::new_no_op(true) + } else { + panic!("One of --print or --output or --host must be provided."); + }; + + let buckets = if config.host.is_some() { + // Buckets are only relevant if we're writing to the API + match (config.org, config.bucket, config.database_list) { + (Some(org), Some(bucket), None) => { + vec![format!("{org}_{bucket}")] + } + (None, None, Some(bucket_list)) => { + let f = File::open(bucket_list).expect("unable to open database_list file"); + + io::BufReader::new(f) + .lines() + .map(|l| l.expect("unable to read database from database_list file")) + .collect::<Vec<_>>() + } + _ => panic!("must specify either --org AND --bucket OR --database_list"), + } + } else { + // But we need at least one database or nothing will be written anywhere + vec![String::from("org_bucket")] + }; + + let result = iox_data_generator::generate( + &data_spec, + buckets, + &mut points_writer_builder, + start_datetime, + end_datetime, + execution_start_time_nanos, + continue_on, + config.batch_size, + config.print, + ) + .await; + + match result { + Ok(total_points) => { + if !config.print { + eprintln!("Submitted {total_points} total points"); + } + } + Err(e) => eprintln!("Execution failed: \n{e}"), + } + + Ok(()) +} + +fn datetime_nanoseconds(arg: Option<&str>, now: DateTime<Local>) -> Option<i64> { + arg.map(|s| { + let datetime = humantime::parse_rfc3339(s) + .map(Into::into) + .unwrap_or_else(|_| { + let std_duration = humantime::parse_duration(s).expect("Could not parse time"); + let chrono_duration = chrono::Duration::from_std(std_duration) + .expect("Could not convert std::time::Duration to chrono::Duration"); + now - chrono_duration + }); + + datetime + .timestamp_nanos_opt() 
+ .expect("timestamp out of range") + }) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn none_datetime_is_none_nanoseconds() { + let ns = datetime_nanoseconds(None, Local::now()); + assert!(ns.is_none()); + } + + #[test] + fn rfc3339() { + let ns = datetime_nanoseconds(Some("2020-01-01T01:23:45Z"), Local::now()); + assert_eq!(ns, Some(1_577_841_825_000_000_000)); + } + + #[test] + fn relative() { + let fixed_now = Local::now(); + let ns = datetime_nanoseconds(Some("1hr"), fixed_now); + let expected = (fixed_now - chrono::Duration::hours(1)) + .timestamp_nanos_opt() + .unwrap(); + assert_eq!(ns, Some(expected)); + } +} diff --git a/iox_data_generator/src/field.rs b/iox_data_generator/src/field.rs new file mode 100644 index 0000000..a32a9d8 --- /dev/null +++ b/iox_data_generator/src/field.rs @@ -0,0 +1,546 @@ +//! Generating a set of field keys and values given a specification + +use crate::{ + now_ns, specification, + substitution::{self, pick_from_replacements}, +}; + +use handlebars::Handlebars; +use rand::rngs::SmallRng; +use rand::Rng; +use rand::SeedableRng; +use serde_json::json; +use serde_json::Value; +use snafu::{ResultExt, Snafu}; +use std::{ops::Range, time::Duration}; + +/// Field-specific Results +pub type Result = std::result::Result; + +/// Errors that may happen while creating fields +#[derive(Snafu, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Could not create field name, caused by:\n{}", source))] + CouldNotCreateFieldName { source: crate::substitution::Error }, + + #[snafu(display("Could not compile string field template: {}", source))] + CouldNotCompileStringTemplate { + #[snafu(source(from(handlebars::TemplateError, Box::new)))] + source: Box, + }, + + #[snafu(display("Could not render string field template: {}", source))] + CouldNotRenderStringTemplate { + #[snafu(source(from(handlebars::RenderError, Box::new)))] + source: Box, + }, +} + +/// Different field type generators +#[derive(Debug)] +pub enum 
FieldGeneratorImpl { + /// Boolean field generator + Bool(BooleanFieldGenerator), + /// Integer field generator + I64(I64FieldGenerator), + /// Float field generator + F64(F64FieldGenerator), + /// String field generator + String(Box), + /// Uptime field generator + Uptime(UptimeFieldGenerator), +} + +impl FieldGeneratorImpl { + /// Create fields that will generate according to the spec + pub fn from_spec( + spec: &specification::FieldSpec, + data: Value, + execution_start_time: i64, + ) -> Result> { + use specification::FieldValueSpec::*; + + let field_count = spec.count.unwrap_or(1); + + let mut fields = Vec::with_capacity(field_count); + + for field_id in 1..field_count + 1 { + let mut data = data.clone(); + let d = data.as_object_mut().expect("data must be object"); + d.insert("field".to_string(), json!({ "id": field_id })); + + let field_name = substitution::render_once("field", &spec.name, &data) + .context(CouldNotCreateFieldNameSnafu)?; + + let rng = + SmallRng::from_rng(&mut rand::thread_rng()).expect("SmallRng should always create"); + + let field = match &spec.field_value_spec { + Bool(true) => Self::Bool(BooleanFieldGenerator::new(&field_name, rng)), + Bool(false) => unimplemented!("Not sure what false means for bool fields yet"), + I64 { + range, + increment, + reset_after, + } => Self::I64(I64FieldGenerator::new( + &field_name, + range, + *increment, + *reset_after, + rng, + )), + F64 { range } => Self::F64(F64FieldGenerator::new(&field_name, range, rng)), + String { + pattern, + replacements, + } => Self::String(Box::new(StringFieldGenerator::new( + &field_name, + pattern, + data, + replacements.to_vec(), + rng, + )?)), + Uptime { kind } => Self::Uptime(UptimeFieldGenerator::new( + &field_name, + kind, + execution_start_time, + )), + }; + + fields.push(field); + } + + Ok(fields) + } + + /// Writes the field in line protocol to the passed writer + pub fn write_to(&mut self, mut w: W, timestamp: i64) -> std::io::Result<()> { + match self { + 
Self::Bool(f) => { + let v: bool = f.rng.gen(); + write!(w, "{}={}", f.name, v) + } + Self::I64(f) => { + let v = f.generate_value(); + write!(w, "{}={}", f.name, v) + } + Self::F64(f) => { + let v = f.generate_value(); + write!(w, "{}={}", f.name, v) + } + Self::String(f) => { + let v = f.generate_value(timestamp); + write!(w, "{}=\"{}\"", f.name, v) + } + Self::Uptime(f) => match f.kind { + specification::UptimeKind::I64 => { + let v = f.generate_value(); + write!(w, "{}={}", f.name, v) + } + specification::UptimeKind::Telegraf => { + let v = f.generate_value_as_string(); + write!(w, "{}=\"{}\"", f.name, v) + } + }, + } + } +} + +/// Generate boolean field names and values. +#[derive(Debug)] +pub struct BooleanFieldGenerator { + /// The name (key) of the field + pub name: String, + rng: SmallRng, +} + +impl BooleanFieldGenerator { + /// Create a new boolean field generator that will always use the specified + /// name. + pub fn new(name: &str, rng: SmallRng) -> Self { + let name = name.into(); + + Self { name, rng } + } + + /// Generate a random value + pub fn generate_value(&mut self) -> bool { + self.rng.gen() + } +} + +/// Generate integer field names and values. +#[derive(Debug)] +pub struct I64FieldGenerator { + /// The name (key) of the field + pub name: String, + range: Range, + increment: bool, + rng: SmallRng, + previous_value: i64, + reset_after: Option, + current_tick: usize, +} + +impl I64FieldGenerator { + /// Create a new integer field generator that will always use the specified + /// name. 
+ pub fn new( + name: impl Into, + range: &Range, + increment: bool, + reset_after: Option, + rng: SmallRng, + ) -> Self { + Self { + name: name.into(), + range: range.to_owned(), + increment, + rng, + previous_value: 0, + reset_after, + current_tick: 0, + } + } + + /// Generate a random value + pub fn generate_value(&mut self) -> i64 { + let mut value = if self.range.start == self.range.end { + self.range.start + } else { + self.rng.gen_range(self.range.clone()) + }; + + if self.increment { + self.previous_value = self.previous_value.wrapping_add(value); + value = self.previous_value; + + if let Some(reset) = self.reset_after { + self.current_tick += 1; + if self.current_tick >= reset { + self.previous_value = 0; + self.current_tick = 0; + } + } + } + + value + } +} + +/// Generate floating point field names and values. +#[derive(Debug)] +pub struct F64FieldGenerator { + /// The name (key) of the field + pub name: String, + range: Range, + rng: SmallRng, +} + +impl F64FieldGenerator { + /// Create a new floating point field generator that will always use the + /// specified name. + pub fn new(name: impl Into, range: &Range, rng: SmallRng) -> Self { + Self { + name: name.into(), + range: range.to_owned(), + rng, + } + } + + /// Generate a random value + pub fn generate_value(&mut self) -> f64 { + if (self.range.start - self.range.end).abs() < f64::EPSILON { + self.range.start + } else { + self.rng.gen_range(self.range.clone()) + } + } +} + +/// Generate string field names and values. 
+#[derive(Debug)] +pub struct StringFieldGenerator { + /// The name (key) of the field + pub name: String, + rng: SmallRng, + replacements: Vec, + handlebars: Handlebars<'static>, + data: Value, +} + +impl StringFieldGenerator { + /// Create a new string field generator + pub fn new( + name: impl Into, + template: impl Into, + data: Value, + replacements: Vec, + rng: SmallRng, + ) -> Result { + let name = name.into(); + let mut registry = substitution::new_handlebars_registry(); + registry + .register_template_string(&name, template.into()) + .context(CouldNotCompileStringTemplateSnafu)?; + + Ok(Self { + name, + rng, + replacements, + handlebars: registry, + data, + }) + } + + /// Generate a random value + pub fn generate_value(&mut self, timestamp: i64) -> String { + let replacements = pick_from_replacements(&mut self.rng, &self.replacements); + let d = self.data.as_object_mut().expect("data must be object"); + + if replacements.is_empty() { + d.remove("replacements"); + } else { + d.insert("replacements".to_string(), json!(replacements)); + } + + d.insert("timestamp".to_string(), json!(timestamp)); + + self.handlebars + .render(&self.name, &self.data) + .expect("Unable to substitute string field value") + } +} + +/// Generate an i64 field that has the name `uptime` and the value of the number +/// of seconds since the data generator started running +#[derive(Debug)] +pub struct UptimeFieldGenerator { + /// The name (key) of the field + pub name: String, + execution_start_time: i64, + /// The specification type of the uptime field. 
Either an int64 or a string + pub kind: specification::UptimeKind, +} + +impl UptimeFieldGenerator { + fn new( + name: impl Into, + kind: &specification::UptimeKind, + execution_start_time: i64, + ) -> Self { + Self { + name: name.into(), + kind: *kind, + execution_start_time, + } + } + + /// Generates the uptime as an i64 + pub fn generate_value(&mut self) -> i64 { + let elapsed = Duration::from_nanos((now_ns() - self.execution_start_time) as u64); + elapsed.as_secs() as i64 + } + + /// Generates the uptime as a string, which is what should be used if `self.kind == specification::UptimeKind::Telegraf` + pub fn generate_value_as_string(&mut self) -> String { + let elapsed_seconds = self.generate_value(); + let days = elapsed_seconds / (60 * 60 * 24); + let days_plural = if days == 1 { "" } else { "s" }; + + let mut minutes = elapsed_seconds / 60; + let mut hours = minutes / 60; + hours %= 24; + minutes %= 60; + + format!("{days} day{days_plural}, {hours:02}:{minutes:02}") + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::specification::UptimeKind; + use rand::SeedableRng; + use test_helpers::approximately_equal; + + #[test] + fn generate_i64_field_always_the_same() { + // If the specification has the same number for the start and end of the + // range... + let mut i64fg = + I64FieldGenerator::new("i64fg", &(3..3), false, None, SmallRng::from_entropy()); + + let i64_fields: Vec<_> = (0..10).map(|_| i64fg.generate_value()).collect(); + let expected = i64_fields[0]; + + // All the values generated will always be the same. + assert!(i64_fields.iter().all(|f| *f == expected), "{i64_fields:?}"); + + // If the specification has n for the start and n+1 for the end of the range... 
+ let mut i64fg = + I64FieldGenerator::new("i64fg", &(4..5), false, None, SmallRng::from_entropy()); + + let i64_fields: Vec<_> = (0..10).map(|_| i64fg.generate_value()).collect(); + // We know what the value will be even though we're using a real random number generator + let expected = 4; + + // All the values generated will also always be the same, because the end of the + // range is exclusive. + assert!(i64_fields.iter().all(|f| *f == expected), "{i64_fields:?}"); + } + + #[test] + fn generate_i64_field_within_a_range() { + let range = 3..1000; + + let mut i64fg = + I64FieldGenerator::new("i64fg", &range, false, None, SmallRng::from_entropy()); + + let val = i64fg.generate_value(); + + assert!(range.contains(&val), "`{val}` was not in the range"); + } + + #[test] + fn generate_incrementing_i64_field() { + let mut i64fg = + I64FieldGenerator::new("i64fg", &(3..10), true, None, SmallRng::from_entropy()); + + let val1 = i64fg.generate_value(); + let val2 = i64fg.generate_value(); + let val3 = i64fg.generate_value(); + let val4 = i64fg.generate_value(); + + assert!(val1 < val2, "`{val1}` < `{val2}` was false"); + assert!(val2 < val3, "`{val2}` < `{val3}` was false"); + assert!(val3 < val4, "`{val3}` < `{val4}` was false"); + } + + #[test] + fn incrementing_i64_wraps() { + let rng = SmallRng::from_entropy(); + let range = 3..10; + let previous_value = i64::MAX; + + // Construct by hand to set the previous value at the end of i64's range + let mut i64fg = I64FieldGenerator { + name: "i64fg".into(), + range: range.clone(), + increment: true, + reset_after: None, + rng, + previous_value, + current_tick: 0, + }; + + let resulting_range = + range.start.wrapping_add(previous_value)..range.end.wrapping_add(previous_value); + + let val = i64fg.generate_value(); + + assert!( + resulting_range.contains(&val), + "`{val}` was not in the range" + ); + } + + #[test] + fn incrementing_i64_that_resets() { + let reset_after = Some(3); + let mut i64fg = I64FieldGenerator::new( + 
"i64fg", + &(3..8), + true, + reset_after, + SmallRng::from_entropy(), + ); + + let val1 = i64fg.generate_value(); + let val2 = i64fg.generate_value(); + let val3 = i64fg.generate_value(); + let val4 = i64fg.generate_value(); + + assert!(val1 < val2, "`{val1}` < `{val2}` was false"); + assert!(val2 < val3, "`{val2}` < `{val3}` was false"); + assert!(val4 < val3, "`{val4}` < `{val3}` was false"); + } + + #[test] + fn generate_f64_field_always_the_same() { + // If the specification has the same number for the start and end of the + // range... + let start_and_end = 3.0; + let range = start_and_end..start_and_end; + let mut f64fg = F64FieldGenerator::new("f64fg", &range, SmallRng::from_entropy()); + + let f64_fields: Vec<_> = (0..10).map(|_| f64fg.generate_value()).collect(); + + // All the values generated will always be the same known value. + assert!( + f64_fields + .iter() + .all(|f| approximately_equal(*f, start_and_end)), + "{f64_fields:?}" + ); + } + + #[test] + fn generate_f64_field_within_a_range() { + let range = 3.0..1000.0; + let mut f64fg = F64FieldGenerator::new("f64fg", &range, SmallRng::from_entropy()); + + let val = f64fg.generate_value(); + assert!(range.contains(&val), "`{val}` was not in the range"); + } + + #[test] + fn generate_string_field_with_data() { + let fake_now = 1633595510000000000; + + let mut stringfg = StringFieldGenerator::new( + "str", + r#"my value {{measurement.name}} {{format-time "%Y-%m-%d"}}"#, + json!({"measurement": {"name": "foo"}}), + vec![], + SmallRng::from_entropy(), + ) + .unwrap(); + + assert_eq!("my value foo 2021-10-07", stringfg.generate_value(fake_now)); + } + + #[test] + fn uptime_i64() { + // Pretend data generator started running 10 seconds ago + let seconds_ago = 10; + let execution_start_time = now_ns() - seconds_ago * 1_000_000_000; + let mut uptimefg = UptimeFieldGenerator::new("foo", &UptimeKind::I64, execution_start_time); + + assert_eq!(seconds_ago, uptimefg.generate_value()); + } + + #[test] + fn 
uptime_telegraf() { + // Pretend data generator started running 10 days, 2 hours, and 33 minutes ago + let seconds_ago = 10 * 24 * 60 * 60 + 2 * 60 * 60 + 33 * 60; + let execution_start_time = now_ns() - seconds_ago * 1_000_000_000; + let mut uptimefg = UptimeFieldGenerator::new("foo", &UptimeKind::I64, execution_start_time); + + assert_eq!("10 days, 02:33", uptimefg.generate_value_as_string()); + + // Pretend data generator started running 1 day, 14 hours, and 5 minutes ago + // to exercise different formatting + let seconds_in_1_day = 24 * 60 * 60; + let seconds_in_14_hours = 14 * 60 * 60; + let seconds_in_5_minutes = 5 * 60; + + let seconds_ago = seconds_in_1_day + seconds_in_14_hours + seconds_in_5_minutes; + let execution_start_time = now_ns() - seconds_ago * 1_000_000_000; + + let mut uptimefg = UptimeFieldGenerator::new("foo", &UptimeKind::I64, execution_start_time); + + assert_eq!("1 day, 14:05", uptimefg.generate_value_as_string()); + } +} diff --git a/iox_data_generator/src/lib.rs b/iox_data_generator/src/lib.rs new file mode 100644 index 0000000..a496e61 --- /dev/null +++ b/iox_data_generator/src/lib.rs @@ -0,0 +1,343 @@ +//! This crate contains structures and generators for specifying how to generate +//! historical and real-time test data for Delorean. The rules for how to +//! generate data and what shape it should take can be specified in a TOML file. +//! +//! Generators can output in line protocol, Parquet, or can be used to generate +//! real-time load on a server that implements the [InfluxDB 2.0 write +//! path][write-api]. +//! +//! [write-api]: https://v2.docs.influxdata.com/v2.0/api/#tag/Write +//! +//! While this generator could be compared to [the Go based one that creates TSM +//! data][go-gen], its purpose is meant to be more far reaching. In addition to +//! generating historical data, it should be useful for generating data in a +//! sequence as you would expect it to arrive in a production environment. That +//! 
means many agents sending data with their different tags and timestamps. +//! +//! [go-gen]: https://github.com/influxdata/influxdb/pull/12710 + +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_debug_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives. +use clap as _; +#[cfg(test)] +use criterion as _; +use tracing_subscriber as _; + +use crate::{ + agent::{Agent, AgentGenerateStats}, + tag_set::GeneratedTagSets, +}; +use snafu::{ResultExt, Snafu}; +use std::{ + convert::TryFrom, + sync::{atomic::AtomicU64, Arc}, + time::{SystemTime, UNIX_EPOCH}, +}; + +pub mod agent; +pub mod field; +pub mod measurement; +pub mod specification; +pub mod substitution; +mod tag_pair; +pub mod tag_set; +pub mod write; + +/// Errors that may happen while generating points. 
+#[derive(Snafu, Debug)] +pub enum Error { + /// Error that may happen when waiting on a tokio task + #[snafu(display("Could not join tokio task: {}", source))] + TokioError { + /// Underlying tokio error that caused this problem + source: tokio::task::JoinError, + }, + + /// Error that may happen when constructing an agent name + #[snafu(display("Could not create agent name, caused by:\n{}", source))] + CouldNotCreateAgentName { + /// Underlying `substitution` module error that caused this problem + source: substitution::Error, + }, + + /// Error that may happen when an agent generates points + #[snafu(display("Agent could not generate points, caused by:\n{}", source))] + AgentCouldNotGeneratePoints { + /// Underlying `agent` module error that caused this problem + source: agent::Error, + }, + + /// Error that may happen when creating agents + #[snafu(display("Could not create agents, caused by:\n{}", source))] + CouldNotCreateAgent { + /// Underlying `agent` module error that caused this problem + source: agent::Error, + }, + + /// Error that may happen when constructing an agent's writer + #[snafu(display("Could not create writer for agent, caused by:\n{}", source))] + CouldNotCreateAgentWriter { + /// Underlying `write` module error that caused this problem + source: write::Error, + }, + + /// Error generating tags sets + #[snafu(display("Error generating tag sets prior to creating agents: \n{}", source))] + CouldNotGenerateTagSets { + /// Underlying `tag_set` module error + source: tag_set::Error, + }, + + /// Error splitting input buckets to agents that write to them + #[snafu(display( + "Error splitting input buckets into agents that write to them: {}", + source + ))] + CouldNotAssignAgents { + /// Underlying `specification` module error + source: specification::Error, + }, +} + +type Result = std::result::Result; + +/// Generate data from the configuration in the spec. +/// +/// Provide a writer that the line protocol should be written to. 
+/// +/// If `start_datetime` or `end_datetime` are `None`, the current datetime will +/// be used. +#[allow(clippy::too_many_arguments)] +pub async fn generate( + spec: &specification::DataSpec, + databases: Vec, + points_writer_builder: &mut write::PointsWriterBuilder, + start_datetime: Option, + end_datetime: Option, + execution_start_time: i64, + continue_on: bool, + batch_size: usize, + one_agent_at_a_time: bool, // run one agent after another, if printing to stdout +) -> Result { + let mut handles = vec![]; + + let database_agents = spec + .database_split_to_agents(&databases) + .context(CouldNotAssignAgentsSnafu)?; + + let generated_tag_sets = + GeneratedTagSets::from_spec(spec).context(CouldNotGenerateTagSetsSnafu)?; + + let lock = Arc::new(tokio::sync::Mutex::new(())); + + let start = std::time::Instant::now(); + let total_rows = Arc::new(AtomicU64::new(0)); + let total_requests = Arc::new(AtomicU64::new(0)); + + for database_assignments in &database_agents { + let (org, bucket) = org_and_bucket_from_database(database_assignments.database); + + for agent_assignment in database_assignments.agent_assignments.iter() { + let agents = Agent::from_spec( + agent_assignment.spec, + agent_assignment.count, + agent_assignment.sampling_interval, + start_datetime, + end_datetime, + execution_start_time, + continue_on, + &generated_tag_sets, + ) + .context(CouldNotCreateAgentSnafu)?; + + println!( + "Configuring {} agents of \"{}\" to write data \ + to org {} and bucket {} (database {})", + agent_assignment.count, + agent_assignment.spec.name, + org, + bucket, + database_assignments.database, + ); + + let agent_points_writer = Arc::new( + points_writer_builder + .build_for_agent(&agent_assignment.spec.name, org, bucket) + .context(CouldNotCreateAgentWriterSnafu)?, + ); + + for mut agent in agents.into_iter() { + let lock_ref = Arc::clone(&lock); + let agent_points_writer = Arc::clone(&agent_points_writer); + + let total_rows = Arc::clone(&total_rows); + let 
total_requests = Arc::clone(&total_requests); + handles.push(tokio::task::spawn(async move { + // did this weird hack because otherwise the stdout outputs would be jumbled + // together garbage + if one_agent_at_a_time { + let _l = lock_ref.lock().await; + agent + .generate_all( + agent_points_writer, + batch_size, + total_rows, + total_requests, + ) + .await + } else { + agent + .generate_all( + agent_points_writer, + batch_size, + total_rows, + total_requests, + ) + .await + } + })); + } + } + } + + let mut stats = vec![]; + for handle in handles { + stats.push( + handle + .await + .context(TokioSnafu)? + .context(AgentCouldNotGeneratePointsSnafu)?, + ); + } + let stats = stats + .into_iter() + .fold(AgentGenerateStats::default(), |totals, res| { + AgentGenerateStats { + request_count: totals.request_count + res.request_count, + error_count: totals.error_count + res.error_count, + row_count: totals.row_count + res.row_count, + } + }); + + println!("{}", stats.display_stats(start.elapsed())); + + Ok(stats.row_count) +} + +/// Gets the current time in nanoseconds since the epoch +pub fn now_ns() -> i64 { + let since_the_epoch = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards"); + i64::try_from(since_the_epoch.as_nanos()).expect("Time does not fit") +} + +fn org_and_bucket_from_database(database: &str) -> (&str, &str) { + let parts = database.split('_').collect::>(); + if parts.len() != 2 { + panic!("error parsing org and bucket from {database}"); + } + + (parts[0], parts[1]) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::specification::*; + use influxdb2_client::models::WriteDataPoint; + use std::str::FromStr; + use std::time::Duration; + + type Error = Box; + type Result = std::result::Result; + + #[tokio::test] + async fn historical_data_sampling_interval() -> Result<()> { + let toml = r#" +name = "demo_schema" + +[[agents]] +name = "foo" + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = 
"val" +i64_range = [1, 1] + +[[database_writers]] +agents = [{name = "foo", sampling_interval = "10s"}] +"#; + let data_spec = DataSpec::from_str(toml).unwrap(); + let agent_spec = &data_spec.agents[0]; + + let execution_start_time = now_ns(); + + // imagine we've specified at the command line that we want to generate metrics + // for 1970 + let start_datetime = Some(0); + // for the first 15 seconds of the year + let end_datetime = Some(15 * 1_000_000_000); + + let generated_tag_sets = GeneratedTagSets::default(); + + let mut agent = agent::Agent::from_spec( + agent_spec, + 1, + Duration::from_secs(10), + start_datetime, + end_datetime, + execution_start_time, + false, + &generated_tag_sets, + )?; + + let data_points = agent[0].generate().await?.into_iter().flatten(); + let mut v = Vec::new(); + for data_point in data_points { + data_point.write_data_point_to(&mut v).unwrap(); + } + let line_protocol = String::from_utf8(v).unwrap(); + + // Get a point for time 0 + let expected_line_protocol = "cpu val=1i 0\n"; + assert_eq!(line_protocol, expected_line_protocol); + + let data_points = agent[0].generate().await?.into_iter().flatten(); + let mut v = Vec::new(); + for data_point in data_points { + data_point.write_data_point_to(&mut v).unwrap(); + } + let line_protocol = String::from_utf8(v).unwrap(); + + // Get a point for time 10s + let expected_line_protocol = "cpu val=1i 10000000000\n"; + assert_eq!(line_protocol, expected_line_protocol); + + // Don't get any points anymore because we're past the ending datetime + let data_points = agent[0].generate().await?.into_iter().flatten(); + let data_points: Vec<_> = data_points.collect(); + assert!( + data_points.is_empty(), + "expected no data points, got {data_points:?}" + ); + + Ok(()) + } +} diff --git a/iox_data_generator/src/measurement.rs b/iox_data_generator/src/measurement.rs new file mode 100644 index 0000000..a354404 --- /dev/null +++ b/iox_data_generator/src/measurement.rs @@ -0,0 +1,661 @@ +//! 
Generating a set of points for one measurement configuration + +#![allow(clippy::result_large_err)] + +use crate::{ + field::FieldGeneratorImpl, + specification, substitution, + tag_pair::TagPair, + tag_set::{GeneratedTagSets, TagSet}, +}; +use influxdb2_client::models::WriteDataPoint; +use serde_json::json; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::{ + fmt::Debug, + sync::{Arc, Mutex}, +}; + +/// Measurement-specific Results +pub type Result = std::result::Result; + +/// Errors that may happen while creating measurements +#[derive(Snafu, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display( + "Could not build data point for measurement `{}` with Influx Client, caused by:\n{}", + name, + source + ))] + InfluxDataPointError { + name: String, + source: influxdb2_client::models::data_point::DataPointError, + }, + + #[snafu(display("Could not create measurement name, caused by:\n{}", source))] + CouldNotCreateMeasurementName { source: crate::substitution::Error }, + + #[snafu(display( + "Could not create field generator sets for measurement `{}`, caused by:\n{}", + name, + source + ))] + CouldNotCreateFieldGeneratorSets { + name: String, + source: crate::field::Error, + }, + + #[snafu(display( + "Tag set {} referenced not found for measurement {}", + tag_set, + measurement + ))] + GeneratedTagSetNotFound { + tag_set: String, + measurement: String, + }, + + #[snafu(display("Could not compile template `{}`, caused by:\n{}", template, source))] + CantCompileTemplate { + source: handlebars::TemplateError, + template: String, + }, + + #[snafu(display("Could not render template `{}`, caused by:\n{}", template, source))] + CantRenderTemplate { + source: handlebars::RenderError, + template: String, + }, + + #[snafu(display("Error creating measurement tag pairs: {}", source))] + CouldNotCreateMeasurementTagPairs { source: crate::tag_pair::Error }, +} + +/// Generate measurements +#[derive(Debug)] +pub struct MeasurementGenerator { + measurement: Arc>, 
+} + +impl MeasurementGenerator { + /// Create the count specified number of measurement generators from + /// the passed `MeasurementSpec` + pub fn from_spec( + agent_id: usize, + spec: &specification::MeasurementSpec, + execution_start_time: i64, + generated_tag_sets: &GeneratedTagSets, + agent_tag_pairs: &[Arc], + ) -> Result> { + let count = spec.count.unwrap_or(1) + 1; + + (1..count) + .map(|measurement_id| { + Self::new( + agent_id, + measurement_id, + spec, + execution_start_time, + generated_tag_sets, + agent_tag_pairs, + ) + }) + .collect::>>() + } + + /// Create a new way to generate measurements from a specification + #[allow(clippy::too_many_arguments)] + pub fn new( + agent_id: usize, + measurement_id: usize, + spec: &specification::MeasurementSpec, + execution_start_time: i64, + generated_tag_sets: &GeneratedTagSets, + agent_tag_pairs: &[Arc], + ) -> Result { + let measurement_name = substitution::render_once( + "measurement", + &spec.name, + &json!({ + "agent": {"id": agent_id}, + "measurement": {"id": measurement_id}, + }), + ) + .context(CouldNotCreateMeasurementNameSnafu)?; + + let fields = spec + .fields + .iter() + .map(|field_spec| { + let data = json!({ + "agent": {"id": agent_id}, + "measurement": {"id": measurement_id, "name": &measurement_name}, + }); + + FieldGeneratorImpl::from_spec(field_spec, data, execution_start_time) + }) + .collect::>>() + .context(CouldNotCreateFieldGeneratorSetsSnafu { + name: &measurement_name, + })? 
+ .into_iter() + .flatten() + .collect(); + + // generate the tag pairs + let template_data = json!({ + "agent": {"id": agent_id}, + "measurement": {"id": measurement_id, "name": &measurement_name}, + }); + + let mut tag_pairs = TagPair::pairs_from_specs(&spec.tag_pairs, template_data) + .context(CouldNotCreateMeasurementTagPairsSnafu)?; + for t in agent_tag_pairs { + tag_pairs.push(Arc::clone(t)); + } + + let generated_tag_sets = match &spec.tag_set { + Some(t) => Arc::clone(generated_tag_sets.sets_for(t).context( + GeneratedTagSetNotFoundSnafu { + tag_set: t, + measurement: &measurement_name, + }, + )?), + // if there's no generated tag set, just have an empty set as a single row so + // it can be used to generate the single line that will come out of each generation + // for this measurement. + None => Arc::new(vec![TagSet { tags: vec![] }]), + }; + + // I have this gnarly tag ordering construction so that I can keep the pre-generated + // tag sets in their existing vecs without moving them around so that I can have + // many thousands of agents and measurements that use the same tagset without blowing + // up the number of vectors and memory I consume. 
+ let mut tag_ordering: Vec<_> = tag_pairs + .iter() + .enumerate() + .map(|(i, p)| (p.key(), TagOrdering::Pair(i))) + .chain( + generated_tag_sets[0] + .tags + .iter() + .enumerate() + .map(|(i, p)| (p.key.to_string(), TagOrdering::Generated(i))), + ) + .collect(); + tag_ordering.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + let tag_ordering: Vec<_> = tag_ordering.into_iter().map(|(_, o)| o).collect(); + + Ok(Self { + measurement: Arc::new(Mutex::new(Measurement { + name: measurement_name, + tag_pairs, + generated_tag_sets, + tag_ordering, + fields, + })), + }) + } + + /// Create a line iterator to generate lines for a single sampling + pub fn generate(&mut self, timestamp: i64) -> Result { + Ok(MeasurementLineIterator { + measurement: Arc::clone(&self.measurement), + index: 0, + timestamp, + }) + } +} + +/// Details for the measurement to be generated. Can generate many lines +/// for each sampling. +#[derive(Debug)] +pub struct Measurement { + name: String, + tag_pairs: Vec>, + generated_tag_sets: Arc>, + tag_ordering: Vec, + fields: Vec, +} + +impl Measurement { + /// The number of lines that will be generated for each sampling of this measurement. + pub fn line_count(&self) -> usize { + self.generated_tag_sets.len() + } + + /// Write the specified line as line protocol to the passed in writer. + pub fn write_index_to( + &mut self, + index: usize, + timestamp: i64, + mut w: W, + ) -> std::io::Result<()> { + write!(w, "{}", self.name)?; + let row_tags = &self.generated_tag_sets[index].tags; + for t in &self.tag_ordering { + match t { + TagOrdering::Generated(index) => { + let t = &row_tags[*index]; + write!(w, ",{}={}", t.key, t.value)?; + } + TagOrdering::Pair(index) => { + let t = &self.tag_pairs[*index].as_ref(); + match t { + TagPair::Static(t) => write!(w, ",{}={}", t.key, t.value)?, + TagPair::Regenerating(t) => { + let mut t = t.lock().expect("mutex poisoned"); + let p = t.tag_pair(); + write!(w, ",{}={}", p.key, p.value)? 
+ } + } + } + } + } + + for (i, field) in self.fields.iter_mut().enumerate() { + let d = if i == 0 { b" " } else { b"," }; + w.write_all(d)?; + + match field { + FieldGeneratorImpl::Bool(f) => { + let v = f.generate_value(); + write!(w, "{}={}", f.name, if v { "t" } else { "f" })?; + } + FieldGeneratorImpl::I64(f) => { + let v = f.generate_value(); + write!(w, "{}={}i", f.name, v)?; + } + FieldGeneratorImpl::F64(f) => { + let v = f.generate_value(); + write!(w, "{}={}", f.name, v)?; + } + FieldGeneratorImpl::String(f) => { + let v = f.generate_value(timestamp); + write!(w, "{}=\"{}\"", f.name, v)?; + } + FieldGeneratorImpl::Uptime(f) => match f.kind { + specification::UptimeKind::I64 => { + let v = f.generate_value(); + write!(w, "{}={}i", f.name, v)?; + } + specification::UptimeKind::Telegraf => { + let v = f.generate_value_as_string(); + write!(w, "{}=\"{}\"", f.name, v)?; + } + }, + } + } + + writeln!(w, " {timestamp}") + } +} + +#[derive(Debug)] +enum TagOrdering { + Pair(usize), + Generated(usize), +} + +/// Iterator to generate the lines for a given measurement +#[derive(Debug)] +pub struct MeasurementLineIterator { + measurement: Arc>, + index: usize, + timestamp: i64, +} + +impl MeasurementLineIterator { + /// Number of lines that will be generated for this measurement + pub fn line_count(&self) -> usize { + let m = self.measurement.lock().expect("mutex poinsoned"); + m.line_count() + } +} + +impl Iterator for MeasurementLineIterator { + type Item = LineToGenerate; + + /// Get the details for the next `LineToGenerate` + fn next(&mut self) -> Option { + let m = self.measurement.lock().expect("mutex poinsoned"); + + if self.index >= m.line_count() { + None + } else { + let n = Some(LineToGenerate { + measurement: Arc::clone(&self.measurement), + index: self.index, + timestamp: self.timestamp, + }); + self.index += 1; + n + } + } +} + +/// A pointer to the line to be generated. Will be evaluated when asked to write. 
+#[derive(Debug)] +pub struct LineToGenerate { + /// The measurement state to be used to generate the line + pub measurement: Arc>, + /// The index into the generated tag pairs of the line we're generating + pub index: usize, + /// The timestamp of the line that we're generating + pub timestamp: i64, +} + +impl WriteDataPoint for LineToGenerate { + /// Generate the data and write the line to the passed in writer. + fn write_data_point_to(&self, w: W) -> std::io::Result<()> + where + W: std::io::Write, + { + let mut m = self.measurement.lock().expect("mutex poisoned"); + m.write_index_to(self.index, self.timestamp, w) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::specification::*; + use influxdb2_client::models::WriteDataPoint; + use std::str; + + type Error = Box; + type Result = std::result::Result; + + impl MeasurementGenerator { + fn generate_string(&mut self, timestamp: i64) -> Result { + self.generate_strings(timestamp) + .map(|mut strings| strings.swap_remove(0)) + } + + fn generate_strings(&mut self, timestamp: i64) -> Result> { + let points = self.generate(timestamp)?; + points + .into_iter() + .map(|point| { + let mut v = Vec::new(); + point.write_data_point_to(&mut v)?; + Ok(String::from_utf8(v)?) + }) + .collect() + } + } + + #[test] + fn generate_measurement() -> Result { + let fake_now = 5678; + + // This is the same as the previous test but with an additional field. 
+ let measurement_spec = MeasurementSpec { + name: "cpu".into(), + count: Some(2), + fields: vec![ + FieldSpec { + name: "load".into(), + field_value_spec: FieldValueSpec::F64 { range: 0.0..100.0 }, + count: None, + }, + FieldSpec { + name: "response_time".into(), + field_value_spec: FieldValueSpec::I64 { + range: 0..60_000, + increment: false, + reset_after: None, + }, + count: None, + }, + ], + tag_set: None, + tag_pairs: vec![], + }; + + let generated_tag_sets = GeneratedTagSets::default(); + + let mut measurement_generator = + MeasurementGenerator::new(0, 0, &measurement_spec, fake_now, &generated_tag_sets, &[]) + .unwrap(); + + let line_protocol = vec![measurement_generator.generate_string(fake_now)?]; + let response_times = extract_field_values("response_time", &line_protocol); + + let next_line_protocol = vec![measurement_generator.generate_string(fake_now + 1)?]; + let next_response_times = extract_field_values("response_time", &next_line_protocol); + + // Each line should have a different response time unless we get really, really unlucky + assert_ne!(response_times, next_response_times); + + Ok(()) + } + + #[test] + fn generate_measurement_with_basic_tags() -> Result { + let fake_now = 678; + + let measurement_spec = MeasurementSpec { + name: "measurement".to_string(), + count: None, + tag_set: None, + tag_pairs: vec![ + TagPairSpec { + key: "some_name".to_string(), + template: "some_value".to_string(), + count: None, + regenerate_after_lines: None, + }, + TagPairSpec { + key: "tag_name".to_string(), + template: "tag_value".to_string(), + count: None, + regenerate_after_lines: None, + }, + ], + fields: vec![FieldSpec { + name: "field_name".to_string(), + field_value_spec: FieldValueSpec::I64 { + range: 1..1, + increment: false, + reset_after: None, + }, + count: None, + }], + }; + let generated_tag_sets = GeneratedTagSets::default(); + + let mut measurement_generator = + MeasurementGenerator::new(0, 0, &measurement_spec, fake_now, &generated_tag_sets, 
&[]) + .unwrap(); + + let line_protocol = measurement_generator.generate_string(fake_now)?; + + assert_eq!( + line_protocol, + format!( + "measurement,some_name=some_value,tag_name=tag_value field_name=1i {fake_now}\n" + ) + ); + + Ok(()) + } + + #[test] + fn generate_measurement_with_tags_with_count() { + let fake_now = 678; + + let measurement_spec = MeasurementSpec { + name: "measurement".to_string(), + count: None, + tag_set: None, + tag_pairs: vec![TagPairSpec { + key: "some_name".to_string(), + template: "some_value {{id}}".to_string(), + count: Some(2), + regenerate_after_lines: None, + }], + fields: vec![FieldSpec { + name: "field_name".to_string(), + field_value_spec: FieldValueSpec::I64 { + range: 1..1, + increment: false, + reset_after: None, + }, + count: None, + }], + }; + let generated_tag_sets = GeneratedTagSets::default(); + + let mut measurement_generator = + MeasurementGenerator::new(0, 0, &measurement_spec, fake_now, &generated_tag_sets, &[]) + .unwrap(); + + let line_protocol = measurement_generator.generate_string(fake_now).unwrap(); + + assert_eq!( + line_protocol, + format!( + "measurement,some_name=some_value 1,some_name2=some_value 2 field_name=1i {fake_now}\n" + ) + ); + } + + #[test] + fn regenerating_after_lines() { + let data_spec: specification::DataSpec = toml::from_str( + r#" + name = "ex" + + [[values]] + name = "foo" + template = "{{id}}" + cardinality = 3 + + [[tag_sets]] + name = "foo_set" + for_each = ["foo"] + + [[agents]] + name = "foo" + + [[agents.measurements]] + name = "m1" + tag_set = "foo_set" + tag_pairs = [{key = "reg", template = "data-{{line_number}}", regenerate_after_lines = 2}] + + [[agents.measurements.fields]] + name = "val" + i64_range = [3, 3] + + [[database_writers]] + agents = [{name = "foo", sampling_interval = "10s"}]"#, + ) + .unwrap(); + + let fake_now = 678; + + let generated_tag_sets = GeneratedTagSets::from_spec(&data_spec).unwrap(); + + let mut measurement_generator = MeasurementGenerator::new( + 42, 
+ 1, + &data_spec.agents[0].measurements[0], + fake_now, + &generated_tag_sets, + &[], + ) + .unwrap(); + + let points = measurement_generator.generate(fake_now).unwrap(); + let mut v = Vec::new(); + for point in points { + point.write_data_point_to(&mut v).unwrap(); + } + let line_protocol = str::from_utf8(&v).unwrap(); + + assert_eq!( + line_protocol, + format!( + "m1,foo=1,reg=data-1 val=3i {fake_now}\nm1,foo=2,reg=data-1 val=3i {fake_now}\nm1,foo=3,reg=data-3 val=3i {fake_now}\n" + ) + ); + } + + #[test] + fn tag_set_and_tag_pairs() { + let data_spec: specification::DataSpec = toml::from_str( + r#" + name = "ex" + + [[values]] + name = "foo" + template = "foo-{{id}}" + cardinality = 2 + + [[tag_sets]] + name = "foo_set" + for_each = ["foo"] + + [[agents]] + name = "foo" + + [[agents.measurements]] + name = "m1" + tag_set = "foo_set" + tag_pairs = [{key = "hello", template = "world{{measurement.id}}"}] + + [[agents.measurements.fields]] + name = "val" + i64_range = [3, 3] + + [[database_writers]] + database_ratio = 1.0 + agents = [{name = "foo", sampling_interval = "10s"}]"#, + ) + .unwrap(); + + let fake_now = 678; + + let generated_tag_sets = GeneratedTagSets::from_spec(&data_spec).unwrap(); + + let mut measurement_generator = MeasurementGenerator::new( + 42, + 1, + &data_spec.agents[0].measurements[0], + fake_now, + &generated_tag_sets, + &[], + ) + .unwrap(); + + let points = measurement_generator.generate(fake_now).unwrap(); + let mut v = Vec::new(); + for point in points { + point.write_data_point_to(&mut v).unwrap(); + } + let line_protocol = str::from_utf8(&v).unwrap(); + + assert_eq!( + line_protocol, + format!( + "m1,foo=foo-1,hello=world1 val=3i {fake_now}\nm1,foo=foo-2,hello=world1 val=3i {fake_now}\n" + ) + ); + } + + fn extract_field_values<'a>(field_name: &str, lines: &'a [String]) -> Vec<&'a str> { + lines + .iter() + .map(|line| { + let mut split = line.splitn(2, ' '); + split.next(); + let after_space = split.next().unwrap(); + let prefix = 
format!(",{field_name}="); + let after = after_space.rsplit_once(&prefix).unwrap().1; + after.split_once(',').map_or(after, |x| x.0) + }) + .collect() + } +} diff --git a/iox_data_generator/src/specification.rs b/iox_data_generator/src/specification.rs new file mode 100644 index 0000000..4b30935 --- /dev/null +++ b/iox_data_generator/src/specification.rs @@ -0,0 +1,900 @@ +//! Reading and interpreting data generation specifications. + +use humantime::parse_duration; +use regex::Regex; +use serde::Deserialize; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::{fs, ops::Range, str::FromStr, sync::Arc, time::Duration}; +use tracing::warn; + +/// Errors that may happen while reading a TOML specification. +#[derive(Snafu, Debug)] +#[allow(missing_docs)] +pub enum Error { + /// File-related error that may happen while reading a specification + #[snafu(display( + r#"Error reading data spec from TOML file at {}: {}"#, + file_name, + source + ))] + ReadFile { + file_name: String, + /// Underlying I/O error that caused this problem + source: std::io::Error, + }, + + /// TOML parsing error that may happen while interpreting a specification + #[snafu(display(r#"Error parsing data spec from TOML: {}"#, source))] + Parse { + /// Underlying TOML error that caused this problem + source: toml::de::Error, + }, + + #[snafu(display("Sampling interval must be valid string: {}", source))] + InvalidSamplingInterval { source: humantime::DurationError }, + + #[snafu(display( + "Agent {} referenced in database_writers, but not present in spec", + agent + ))] + AgentNotFound { agent: String }, + + #[snafu(display("database_writers can only use database_ratio or database_regex, not both"))] + DatabaseWritersConfig, + + #[snafu(display( + "database_writer missing database_regex. 
If one uses a regex, all others must also use it" + ))] + RegexMissing, + + #[snafu(display("database_writers regex {} failed with error: {}", regex, source))] + RegexCompile { regex: String, source: regex::Error }, +} + +type Result = std::result::Result; + +/// The full specification for the generation of a data set. +#[derive(Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct DataSpec { + /// This name can be referenced in handlebars templates as `{{spec_name}}` + pub name: String, + /// Specifies values that are generated before agents are created. These values + /// can be used in tag set specs, which will pre-create tag sets that can then be + /// used by the agent specs. + #[serde(default)] + pub values: Vec, + /// Specifies collections of tag sets that can be referenced by agents. These + /// pre-generated tag sets are an efficient way to have many tags without + /// re-rendering their values on every agent generation. They can also have + /// dependent values, making it easy to create high cardinality data sets + /// without running through many handlebar renders while having a well defined + /// set of tags that appear. + #[serde(default)] + pub tag_sets: Vec, + /// The specification for the agents that can be used to write data to databases. + pub agents: Vec, + /// The specification for writing to the provided list of databases. + pub database_writers: Vec, +} + +impl DataSpec { + /// Given a filename, read the file and parse the specification. 
+ pub fn from_file(file_name: &str) -> Result { + let spec_toml = fs::read_to_string(file_name).context(ReadFileSnafu { file_name })?; + Self::from_str(&spec_toml) + } + + /// Given a collection of database names, assign each a set of agents based on the spec + pub fn database_split_to_agents<'a>( + &'a self, + databases: &'a [String], + ) -> Result>> { + let mut database_agents = Vec::with_capacity(databases.len()); + + let mut start = 0; + + // either all database writers must use regex or none of them can. It's either ratio or + // regex for assignment + let use_ratio = self.database_writers[0].database_regex.is_none(); + for b in &self.database_writers { + if use_ratio && b.database_regex.is_some() { + return DatabaseWritersConfigSnafu.fail(); + } + } + + for w in &self.database_writers { + let agents: Vec<_> = w + .agents + .iter() + .map(|a| { + let count = a.count.unwrap_or(1); + let sampling_interval = parse_duration(&a.sampling_interval) + .context(InvalidSamplingIntervalSnafu)?; + let spec = self + .agent_by_name(&a.name) + .context(AgentNotFoundSnafu { agent: &a.name })?; + + Ok(AgentAssignment { + spec, + count, + sampling_interval, + }) + }) + .collect::>>()?; + let agents = Arc::new(agents); + + let selected_databases = if use_ratio { + if start >= databases.len() { + warn!( + "database_writers percentages > 1.0. 
Writer {:?} and later skipped.", + w + ); + break; + } + + let mut end = (databases.len() as f64 * w.database_ratio.unwrap_or(1.0)).ceil() + as usize + + start; + if end > databases.len() { + end = databases.len(); + } + + let selected_databases = databases[start..end].iter().collect::>(); + start = end; + selected_databases + } else { + let p = w.database_regex.as_ref().context(RegexMissingSnafu)?; + let re = Regex::new(p).context(RegexCompileSnafu { regex: p })?; + databases + .iter() + .filter(|name| re.is_match(name)) + .collect::>() + }; + + for database in selected_databases { + database_agents.push(DatabaseAgents { + database, + agent_assignments: Arc::clone(&agents), + }) + } + } + + Ok(database_agents) + } + + /// Get the agent spec by its name + pub fn agent_by_name(&self, name: &str) -> Option<&AgentSpec> { + self.agents.iter().find(|&a| a.name == name) + } +} + +#[derive(Debug)] +/// Assignment info for an agent to a database +pub struct AgentAssignment<'a> { + /// The agent specification for writing to the assigned database + pub spec: &'a AgentSpec, + /// The number of these agents that should be writing to the database + pub count: usize, + /// The sampling interval agents will generate data on + pub sampling_interval: Duration, +} + +#[derive(Debug)] +/// Agent assignments mapped to a database +pub struct DatabaseAgents<'a> { + /// The database data will get written to + pub database: &'a str, + /// The agents specifications that will be writing to the database + pub agent_assignments: Arc>>, +} + +impl FromStr for DataSpec { + type Err = Error; + + fn from_str(spec_toml: &str) -> std::result::Result::Err> { + let spec: Self = toml::from_str(spec_toml).context(ParseSnafu)?; + Ok(spec) + } +} + +/// The specification of values that can be used to generate tag sets +#[derive(Deserialize, Debug, Clone)] +#[cfg_attr(test, derive(Default))] +#[serde(deny_unknown_fields)] +pub struct ValuesSpec { + /// The name of the collection of values + pub name: 
String, + /// If values not specified this handlebars template will be used to create each value in the + /// collection + pub template: String, + /// How many of these values should be generated. If belongs_to is + /// specified, each parent will have this many of this value. So + /// the total number of these values generated would be parent.len() * self.cardinality + pub cardinality: usize, + /// A collection of strings to other values. Each one of these values will have one + /// of the referenced has_one. Further, when generating this, the has_one collection + /// will cycle through so that each successive value will use the next has_one value + /// for association + pub has_one: Option>, + /// A collection of values that each of these values belongs to. These relationships + /// can be referenced in the value generation and in the generation of tag sets. + pub belongs_to: Option, +} + +impl ValuesSpec { + /// returns true if there are other value collections that this values spec must use to + /// be generated + pub fn has_dependent_values(&self) -> bool { + self.has_one.is_some() || self.belongs_to.is_some() + } +} + +/// The specification of tag sets that can be referenced in measurements to pull a pre-generated +/// set of tags in. +#[derive(Deserialize, Debug)] +#[cfg_attr(test, derive(Default))] +#[serde(deny_unknown_fields)] +pub struct TagSetsSpec { + /// The name of the tag set spec + pub name: String, + /// An array of the `ValuesSpec` to loop through. To reference parent belongs_to or has_one + /// values, the parent should come first and then the has_one or child next. Each successive + /// entry in this array is a nested loop. Multiple has_one and a belongs_to on a parent can + /// be traversed. + pub for_each: Vec, +} + +/// The specification for what should be written to the list of provided databases. +/// Databases will be written to by one or more agents with the given sampling interval and +/// agent count. 
+#[derive(Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct DatabaseWriterSpec { + /// The ratio of databases from the provided list that should use these agents. The + /// ratios of the collection of database_writer specs should add up to 1.0. If ratio + /// is not provided it will default to 1.0 (useful for when you specify only a single + /// database_writer. + /// + /// The interval over the provided list of databases is the cumulative sum of the + /// previous ratios to this ratio. So if you have 10 input databases and 3 database_writers + /// with ratios (in order) of `[0.2, 0.4, and 0.6]` you would have the input list of + /// 10 databases split into these three based on their index in the list: `[0, 1]`, + /// `[2, 5]`, and `[6, 9]`. The first 2 databases, then the next 4, then the remaining 6. + /// + /// The list isn't shuffled as ratios are applied. + pub database_ratio: Option, + /// Regex to select databases from the provided list. If regex is used in any one + /// of the database_writers, it must be used for all of them. + pub database_regex: Option, + /// The agents that should be used to write to these databases. + pub agents: Vec, +} + +/// The specification for the specific configuration of how an agent should write to a database. +#[derive(Deserialize, Debug, Clone)] +#[serde(deny_unknown_fields)] +pub struct AgentAssignmentSpec { + /// The name of the `AgentSpec` to use + pub name: String, + /// The number of these agents that should write to the database + pub count: Option, + /// How frequently each agent will write to the database. This is applicable when using the + /// --continue flag. Otherwise, if doing historical backfill, timestamps of generated data + /// will be this far apart and data will be written in as quickly as possible. + pub sampling_interval: String, +} + +/// The specification of the behavior of an agent, the entity responsible for +/// generating a number of data points according to its configuration. 
+#[derive(Deserialize, Debug)] +#[cfg_attr(test, derive(Default))] +#[serde(deny_unknown_fields)] +pub struct AgentSpec { + /// The name of the agent, which can be referenced in templates with `agent.name`. + pub name: String, + /// The specifications for the measurements for the agent to generate. + pub measurements: Vec, + /// A collection of strings that reference other `Values` collections. Each agent will have one + /// of the referenced has_one. Further, when generating this, the has_one collection + /// will cycle through so that each successive agent will use the next has_one value + /// for association + #[serde(default)] + pub has_one: Vec, + /// Specification of tag key/value pairs that get generated once and reused for + /// every sampling. Every measurement (and thus line) will have these tag pairs added onto it. + /// The template can use `{{agent.id}}` to reference the agent's id and `{{guid}}` or + /// `{{random N}}` to generate random strings. + #[serde(default)] + pub tag_pairs: Vec, +} + +/// The specification of how to generate data points for a particular +/// measurement. +#[derive(Deserialize, Debug)] +#[cfg_attr(test, derive(Default))] +#[serde(deny_unknown_fields)] +pub struct MeasurementSpec { + /// Name of the measurement. Can be a plain string or a string with + /// placeholders for: + /// + /// - `{{agent.id}}` - the agent ID + /// - `{{measurement.id}}` - the measurement's ID, which must be used if + /// `count` > 1 so that unique measurement names are created + pub name: String, + /// The number of measurements with this configuration that should be + /// created. Default value is 1. If specified, use `{{id}}` + /// in this measurement's `name` to create unique measurements. + pub count: Option, + /// Specifies a tag set to include in every sampling in addition to tags specified + pub tag_set: Option, + /// Specification of tag key/value pairs that get generated once and reused for + /// every sampling. 
+ #[serde(default)] + pub tag_pairs: Vec, + /// Specification of the fields for this measurement. At least one field is + /// required. + pub fields: Vec, +} + +/// Specification of a tag key/value pair whose template will be evaluated once and +/// the value will be reused across every sampling. +#[derive(Deserialize, Debug, Clone)] +#[cfg_attr(test, derive(Default))] +#[serde(deny_unknown_fields)] +pub struct TagPairSpec { + /// The tag key. If `count` is specified, the id of the tag will be automatically + /// appended to the end of the key to ensure it is unique. + pub key: String, + /// The template to generate the tag value + pub template: String, + /// If specified, this number of tags will be generated with this template. Each will + /// have a key of `key#` where # is the number. Useful for creating a degenerate case + /// of having dozens or hundreds of tags + pub count: Option, + /// If specified, the tag template will be re-evaluated after this many lines have been + /// generated. This will go across samplings. For example, if you have this set to 3 and + /// each sample generates two lines, it will get regenerated after the first line in the + /// second sample. This is useful for simulating things like tracing use cases or ephemeral + /// identifiers like process or container IDs. The template has access to the normal data + /// accessible as well as `line_number`. + pub regenerate_after_lines: Option, +} + +/// The specification of how to generate field keys and values for a particular +/// measurement. +#[derive(Deserialize, Debug)] +#[cfg_attr(test, derive(Default))] +#[serde(from = "FieldSpecIntermediate")] +pub struct FieldSpec { + /// Key/name for this field. 
Can be a plain string or a string with + /// placeholders for: + /// + /// - `{{agent.id}}` - the agent ID + /// - `{{measurement.id}}` - the measurement ID + /// - `{{field.id}}` - the field ID, which must be used if `count` > 1 so + /// that unique field names are created + pub name: String, + /// Specification for the value for this field. + pub field_value_spec: FieldValueSpec, + /// How many fields with this configuration should be created + pub count: Option, +} + +impl From for FieldSpec { + fn from(value: FieldSpecIntermediate) -> Self { + let field_value_spec = if let Some(b) = value.bool { + FieldValueSpec::Bool(b) + } else if let Some((start, end)) = value.i64_range { + FieldValueSpec::I64 { + range: (start..end), + increment: value.increment.unwrap_or(false), + reset_after: value.reset_after, + } + } else if let Some((start, end)) = value.f64_range { + FieldValueSpec::F64 { + range: (start..end), + } + } else if let Some(pattern) = value.template { + FieldValueSpec::String { + pattern, + replacements: value.replacements, + } + } else if let Some(kind) = value.uptime { + FieldValueSpec::Uptime { kind } + } else { + panic!( + "Can't tell what type of field value you're trying to specify with this \ + configuration: `{value:?}" + ); + }; + + Self { + name: value.name, + field_value_spec, + count: value.count, + } + } +} + +/// The specification of a field value of a particular type. Instances should be +/// created by converting a `FieldSpecIntermediate`, which more closely matches +/// the TOML structure. +#[derive(Debug, PartialEq)] +pub enum FieldValueSpec { + /// Configuration of a boolean field. + Bool(bool), + /// Configuration of an integer field. + I64 { + /// The `Range` in which random integer values will be generated. If the + /// range only contains one value, all instances of this field + /// will have the same value. 
+ range: Range, + /// When set to true, after an initial random value in the range is + /// generated, a random increment in the range will be generated + /// and added to the initial value. That means the + /// value for this field will always be increasing. When the value + /// reaches the max value of i64, the value will wrap around to + /// the min value of i64 and increment again. + increment: bool, + /// If `increment` is true, after this many samples, reset the value to + /// start the increasing value over. If this is `None`, the + /// value won't restart until reaching the max value of i64. If + /// `increment` is false, this has no effect. + reset_after: Option, + }, + /// Configuration of a floating point field. + F64 { + /// The `Range` in which random floating point values will be generated. + /// If start == end, all instances of this field will have the + /// same value. + range: Range, + }, + /// Configuration of a string field. + String { + /// Pattern containing placeholders that specifies how to generate the + /// string values. + /// + /// Valid placeholders include: + /// + /// - `{{agent_name}}` - the agent spec's name, with any replacements + /// done + /// - `{{time}}` - the current time in nanoseconds since the epoch. + /// TODO: support specifying a strftime + /// - any other placeholders as specified in `replacements`. If a + /// placeholder has no value specified in `replacements`, it will end + /// up as-is in the field value. + pattern: String, + /// A list of replacement placeholders and the values to replace them + /// with. The values can optionally have weights associated with + /// them to change the probabilities that its value + /// will be used. + replacements: Vec, + }, + /// Configuration of a field with the value of the number of seconds the + /// data generation tool has been running. 
+ Uptime { + /// Format of the uptime value in this field + kind: UptimeKind, + }, +} + +/// The kind of field value to create using the data generation tool's uptime +#[derive(Debug, PartialEq, Eq, Copy, Clone, Deserialize)] +pub enum UptimeKind { + /// Number of seconds since the tool started running as an i64 field + #[serde(rename = "i64")] + I64, + /// Number of seconds since the tool started running, formatted as a string + /// field containing the value in the format "x day(s), HH:MM" + #[serde(rename = "telegraf")] + Telegraf, +} + +#[cfg(test)] +impl Default for FieldValueSpec { + fn default() -> Self { + Self::Bool(true) + } +} + +/// An intermediate representation of the field specification that more directly +/// corresponds to the way field configurations are expressed in TOML. This +/// structure is transformed into the `FieldValueSpec` enum that ensures the +/// options for the different field value types are mutually exclusive. +#[derive(Deserialize, Debug)] +#[serde(deny_unknown_fields)] +struct FieldSpecIntermediate { + /// Key/name for this field. Can be a plain string or a string with + /// placeholders for: + /// + /// - `{{agent_id}}` - the agent ID + /// - `{{measurement_id}}` - the measurement ID + /// - `{{field_id}}` - the field ID, which must be used if `count` > 1 so + /// that unique field names are created + name: String, + /// The number of fields with this configuration that should be created. + /// Default value is 1. If specified, use `{{field_id}}` in this field's + /// `name` to create unique fields. + count: Option, + /// Specify `bool` to make a field that has the Boolean type. `true` means + /// to generate the boolean randomly with equal probability. `false` + /// means...? Specifying any other optional fields along with this one + /// is invalid. + bool: Option, + /// Specify `i64_range` to make an integer field. The values will be + /// randomly generated within the specified range with equal + /// probability. 
If the range only contains one element, all occurrences + /// of this field will have the same value. Can be combined with + /// `increment`; specifying any other optional fields is invalid. + i64_range: Option<(i64, i64)>, + /// Specify `f64_range` to make a floating point field. The values will be + /// randomly generated within the specified range. If start == end, all + /// occurrences of this field will have that value. + /// Can this be combined with `increment`? + f64_range: Option<(f64, f64)>, + /// When set to true with an `i64_range` (is this valid with any other + /// type?), after an initial random value is generated, a random + /// increment will be generated and added to the initial value. That + /// means the value for this field will always be increasing. When the value + /// reaches the end of the range...? The end of the range will be repeated + /// forever? The series will restart at the start of the range? + /// Something else? Setting this to `Some(false)` has the same effect as + /// `None`. + increment: Option, + /// If `increment` is true, after this many samples, reset the value to + /// start the increasing value over. If this is `None`, the value won't + /// restart until reaching the max value of i64. If `increment` is + /// false, this has no effect. + reset_after: Option, + /// Set `pattern` to make a field with the string type. If this doesn't + /// include any placeholders, all occurrences of this field will have + /// this value. + /// + /// Valid placeholders include: + /// + /// - `{{agent.id}}` - the agent spec's name, with any replacements done + /// - any other placeholders as specified in `replacements`. If a + /// placeholder has no value specified in `replacements`, it will end up + /// as-is in the field value. + template: Option, + /// A list of replacement placeholders and the values to replace them with. + /// If a placeholder specified here is not used in `pattern`, it will + /// have no effect. 
The values may optionally have a probability weight + /// specified with them; if not specified, the value will have weight 1. + /// If no weights are specified, the values will be generated with equal + /// probability. + #[serde(default)] + replacements: Vec, + /// The kind of uptime that should be used for this field. If specified, no + /// other options are valid. If not specified, this is not an uptime + /// field. + uptime: Option, +} + +/// The specification of what values to substitute in for placeholders specified +/// in `String` field values. +#[derive(Deserialize, Debug, PartialEq, Eq, Clone)] +#[serde(deny_unknown_fields)] +pub struct Replacement { + /// A placeholder key that can be used in field `pattern`s. + pub replace: String, + /// The possible values to use instead of the placeholder key in `pattern`. + /// Values may optionally have a weight specified. If no weights are + /// specified, the values will be randomly generated with equal + /// probability. The weights are passed to [`rand`'s `choose_weighted` + /// method][choose_weighted] and are a relative likelihood such that the + /// probability of each item being selected is its weight divided by the sum + /// of all weights in this group. + /// + /// [choose_weighted]: https://docs.rs/rand/0.7.3/rand/seq/trait.SliceRandom.html#tymethod.choose_weighted + pub with: Vec, +} + +#[derive(Debug, Deserialize, PartialEq, Eq, Clone)] +#[serde(untagged, deny_unknown_fields)] +/// A possible value to use instead of a placeholder key, optionally with an +/// associated weight. If no weight is specified, the weight used will be 1. +pub enum ReplacementValue { + /// Just a value without a weight + String(String), + /// A value with a specified relative likelihood weight that gets passed on + /// to [`rand`'s `choose_weighted` method][choose_weighted]. The + /// probability of each item being selected is its weight divided by the + /// sum of all weights in the `Replacement` group. 
+ /// + /// [choose_weighted]: https://docs.rs/rand/0.7.3/rand/seq/trait.SliceRandom.html#tymethod.choose_weighted + Weighted(String, u32), +} + +impl ReplacementValue { + /// The associated replacement value + pub fn value(&self) -> &str { + use ReplacementValue::*; + match self { + String(s) => s, + Weighted(s, ..) => s, + } + } + + /// The associated weight value specified; defaults to 1. + pub fn weight(&self) -> u32 { + use ReplacementValue::*; + match self { + String(..) => 1, + Weighted(.., w) => *w, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn sample_schemas_parse() { + let schemas: Vec<&str> = vec![ + include_str!("../schemas/storage_cardinality_example.toml"), + include_str!("../schemas/cap-write.toml"), + include_str!("../schemas/tracing-spec.toml"), + include_str!("../schemas/full_example.toml"), + ]; + + for s in schemas { + if let Err(e) = DataSpec::from_str(s) { + panic!("error {e:?} on\n{s}") + } + } + } + + #[test] + fn not_specifying_vectors_gets_default_empty_vector() { + let toml = r#" +name = "demo_schema" + +[[agents]] +name = "foo" + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = "host" +template = "server" + +[[database_writers]] +database_ratio = 1.0 +agents = [{name = "foo", sampling_interval = "10s"}] +"#; + let spec = DataSpec::from_str(toml).unwrap(); + + let agent0 = &spec.agents[0]; + assert!(agent0.tag_pairs.is_empty()); + + let agent0_measurements = &agent0.measurements; + let a0m0 = &agent0_measurements[0]; + assert!(a0m0.tag_pairs.is_empty()); + + let a0m0_fields = &a0m0.fields; + let a0m0f0 = &a0m0_fields[0]; + let field_spec = &a0m0f0.field_value_spec; + + assert!( + matches!( + field_spec, + FieldValueSpec::String { replacements, .. 
} if replacements.is_empty() + ), + "expected a String field with empty replacements; was {field_spec:?}" + ); + } + + #[test] + fn split_databases_by_writer_spec_ratio() { + let toml = r#" +name = "demo_schema" + +[[agents]] +name = "foo" +[[agents.measurements]] +name = "cpu" +[[agents.measurements.fields]] +name = "host" +template = "server" + +[[agents]] +name = "bar" +[[agents.measurements]] +name = "whatevs" +[[agents.measurements.fields]] +name = "val" +i64_range = [0, 10] + +[[database_writers]] +database_ratio = 0.6 +agents = [{name = "foo", sampling_interval = "10s"}] + +[[database_writers]] +database_ratio = 0.4 +agents = [{name = "bar", sampling_interval = "1m", count = 3}] +"#; + let spec = DataSpec::from_str(toml).unwrap(); + let databases = vec!["a_1".to_string(), "a_2".to_string(), "b_1".to_string()]; + + let database_agents = spec.database_split_to_agents(&databases).unwrap(); + + let b = &database_agents[0]; + assert_eq!(b.database, &databases[0]); + assert_eq!( + b.agent_assignments[0].sampling_interval, + Duration::from_secs(10) + ); + assert_eq!(b.agent_assignments[0].count, 1); + assert_eq!(b.agent_assignments[0].spec.name, "foo"); + + let b = &database_agents[1]; + assert_eq!(b.database, &databases[1]); + assert_eq!( + b.agent_assignments[0].sampling_interval, + Duration::from_secs(10) + ); + assert_eq!(b.agent_assignments[0].count, 1); + assert_eq!(b.agent_assignments[0].spec.name, "foo"); + + let b = &database_agents[2]; + assert_eq!(b.database, &databases[2]); + assert_eq!( + b.agent_assignments[0].sampling_interval, + Duration::from_secs(60) + ); + assert_eq!(b.agent_assignments[0].count, 3); + assert_eq!(b.agent_assignments[0].spec.name, "bar"); + } + + #[test] + fn split_databases_by_writer_spec_regex() { + let toml = r#" +name = "demo_schema" + +[[agents]] +name = "foo" +[[agents.measurements]] +name = "cpu" +[[agents.measurements.fields]] +name = "host" +template = "server" + +[[agents]] +name = "bar" +[[agents.measurements]] +name = 
"whatevs" +[[agents.measurements.fields]] +name = "val" +i64_range = [0, 10] + +[[database_writers]] +database_regex = "foo.*" +agents = [{name = "foo", sampling_interval = "10s"}] + +[[database_writers]] +database_regex = ".*_bar" +agents = [{name = "bar", sampling_interval = "1m", count = 3}] +"#; + + let spec = DataSpec::from_str(toml).unwrap(); + let databases = vec![ + "foo_1".to_string(), + "foo_2".to_string(), + "asdf_bar".to_string(), + ]; + + let database_agents = spec.database_split_to_agents(&databases).unwrap(); + + let b = &database_agents[0]; + assert_eq!(b.database, &databases[0]); + assert_eq!( + b.agent_assignments[0].sampling_interval, + Duration::from_secs(10) + ); + assert_eq!(b.agent_assignments[0].count, 1); + assert_eq!(b.agent_assignments[0].spec.name, "foo"); + + let b = &database_agents[1]; + assert_eq!(b.database, &databases[1]); + assert_eq!( + b.agent_assignments[0].sampling_interval, + Duration::from_secs(10) + ); + assert_eq!(b.agent_assignments[0].count, 1); + assert_eq!(b.agent_assignments[0].spec.name, "foo"); + + let b = &database_agents[2]; + assert_eq!(b.database, &databases[2]); + assert_eq!( + b.agent_assignments[0].sampling_interval, + Duration::from_secs(60) + ); + assert_eq!(b.agent_assignments[0].count, 3); + assert_eq!(b.agent_assignments[0].spec.name, "bar"); + } + + #[test] + fn split_databases_by_writer_regex_and_ratio_error() { + let toml = r#" +name = "demo_schema" + +[[agents]] +name = "foo" +[[agents.measurements]] +name = "cpu" +[[agents.measurements.fields]] +name = "host" +template = "server" + +[[agents]] +name = "bar" +[[agents.measurements]] +name = "whatevs" +[[agents.measurements.fields]] +name = "val" +i64_range = [0, 10] + +[[database_writers]] +database_ratio = 0.8 +agents = [{name = "foo", sampling_interval = "10s"}] + +[[database_writers]] +database_regex = "foo.*" +agents = [{name = "bar", sampling_interval = "1m", count = 3}] +"#; + + let spec = DataSpec::from_str(toml).unwrap(); + let databases = 
vec!["a_1".to_string(), "a_2".to_string(), "b_1".to_string()]; + + let database_agents = spec.database_split_to_agents(&databases); + assert!(matches!( + database_agents.unwrap_err(), + Error::DatabaseWritersConfig + )); + } + + #[test] + fn split_databases_by_writer_ratio_defaults() { + let toml = r#" +name = "demo_schema" + +[[agents]] +name = "foo" +[[agents.measurements]] +name = "cpu" +[[agents.measurements.fields]] +name = "host" +template = "server" + +[[database_writers]] +agents = [{name = "foo", sampling_interval = "10s"}] +"#; + + let spec = DataSpec::from_str(toml).unwrap(); + let databases = vec!["a_1".to_string(), "a_2".to_string()]; + + let database_agents = spec.database_split_to_agents(&databases).unwrap(); + + let b = &database_agents[0]; + assert_eq!(b.database, &databases[0]); + assert_eq!( + b.agent_assignments[0].sampling_interval, + Duration::from_secs(10) + ); + assert_eq!(b.agent_assignments[0].count, 1); + assert_eq!(b.agent_assignments[0].spec.name, "foo"); + + let b = &database_agents[1]; + assert_eq!(b.database, &databases[1]); + assert_eq!( + b.agent_assignments[0].sampling_interval, + Duration::from_secs(10) + ); + assert_eq!(b.agent_assignments[0].count, 1); + assert_eq!(b.agent_assignments[0].spec.name, "foo"); + } +} diff --git a/iox_data_generator/src/substitution.rs b/iox_data_generator/src/substitution.rs new file mode 100644 index 0000000..b5e558a --- /dev/null +++ b/iox_data_generator/src/substitution.rs @@ -0,0 +1,241 @@ +//! Substituting dynamic values into a template as specified in various places +//! in the schema. 
+ +use crate::specification; +use chrono::prelude::*; +use handlebars::{ + Context, Handlebars, Helper, HelperDef, HelperResult, Output, RenderContext, RenderErrorReason, +}; +use rand::rngs::SmallRng; +use rand::{distributions::Alphanumeric, seq::SliceRandom, Rng, RngCore}; +use serde_json::Value; +use snafu::{ResultExt, Snafu}; +use std::collections::BTreeMap; + +/// Substitution-specific Results +pub type Result = std::result::Result; + +/// Errors that may happen while substituting values into templates. +#[derive(Snafu, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display( + "Could not perform text substitution in `{}`, caused by:\n{}", + template, + source + ))] + CantCompileTemplate { + #[snafu(source(from(handlebars::TemplateError, Box::new)))] + source: Box, + template: String, + }, + + #[snafu(display("Could not render template {}, caused by: {}", name, source))] + CantRenderTemplate { + name: String, + #[snafu(source(from(handlebars::RenderError, Box::new)))] + source: Box, + }, + + #[snafu(display( + "Could not perform text substitution in `{}`, caused by:\n{}", + template, + source + ))] + CantPerformSubstitution { + #[snafu(source(from(handlebars::RenderError, Box::new)))] + source: Box, + template: String, + }, +} + +pub(crate) fn render_once(name: &str, template: impl Into, data: &Value) -> Result { + let mut registry = new_handlebars_registry(); + registry.set_strict_mode(true); + let template = template.into(); + registry + .register_template_string(name, &template) + .context(CantCompileTemplateSnafu { template })?; + registry + .render(name, data) + .context(CantRenderTemplateSnafu { name }) +} + +pub(crate) fn new_handlebars_registry() -> Handlebars<'static> { + let mut registry = Handlebars::new(); + registry.set_strict_mode(true); + registry.register_helper("format-time", Box::new(FormatNowHelper)); + registry.register_helper("random", Box::new(RandomHelper)); + registry.register_helper("guid", Box::new(GuidHelper)); + registry 
+} + +#[derive(Debug)] +pub(crate) struct RandomHelper; + +impl HelperDef for RandomHelper { + fn call<'reg: 'rc, 'rc>( + &self, + h: &Helper<'_>, + _: &Handlebars<'_>, + _: &Context, + _: &mut RenderContext<'_, '_>, + out: &mut dyn Output, + ) -> HelperResult { + let param = h + .param(0) + .ok_or(RenderErrorReason::ParamNotFoundForIndex("random", 0))? + .value() + .as_u64() + .ok_or_else(|| { + RenderErrorReason::ParamTypeMismatchForName( + "random", + "0".to_string(), + "unsigned integer".to_string(), + ) + })? + .try_into() + .map_err(|_| { + RenderErrorReason::Other("`random`'s parameter must fit in a usize".to_string()) + })?; + + let mut rng = rand::thread_rng(); + + let random: String = std::iter::repeat(()) + .map(|()| rng.sample(Alphanumeric)) + .map(char::from) + .take(param) + .collect(); + + out.write(&random)?; + + Ok(()) + } +} + +#[derive(Debug)] +pub(crate) struct FormatNowHelper; + +impl HelperDef for FormatNowHelper { + fn call<'reg: 'rc, 'rc>( + &self, + h: &Helper<'_>, + _: &Handlebars<'_>, + c: &Context, + _: &mut RenderContext<'_, '_>, + out: &mut dyn Output, + ) -> HelperResult { + let format = h + .param(0) + .ok_or(RenderErrorReason::ParamNotFoundForIndex("format-time", 0))? 
+ .render(); + + let timestamp = c + .data() + .get("timestamp") + .and_then(|t| t.as_i64()) + .expect("Caller of `render` should have set `timestamp` to an `i64` value"); + + let datetime = Utc.timestamp_nanos(timestamp); + + out.write(&datetime.format(&format).to_string())?; + + Ok(()) + } +} + +#[derive(Debug)] +pub(crate) struct GuidHelper; + +impl HelperDef for GuidHelper { + fn call<'reg: 'rc, 'rc>( + &self, + _h: &Helper<'_>, + _: &Handlebars<'_>, + _: &Context, + _: &mut RenderContext<'_, '_>, + out: &mut dyn Output, + ) -> HelperResult { + let mut rng = rand::thread_rng(); + + let mut bytes = [0u8; 16]; + rng.fill_bytes(&mut bytes); + let mut uid_builder = uuid::Builder::from_bytes(bytes); + uid_builder.set_variant(uuid::Variant::RFC4122); + uid_builder.set_version(uuid::Version::Random); + let uid = uid_builder.into_uuid().to_string(); + + out.write(&uid)?; + + Ok(()) + } +} + +/// Given a random number generator and replacement specification, choose a +/// particular value from the list of possible values according to any specified +/// weights (or with equal probability if there are no weights). 
+pub fn pick_from_replacements<'a>( + rng: &mut SmallRng, + replacements: &'a [specification::Replacement], +) -> BTreeMap<&'a str, &'a str> { + replacements + .iter() + .map(|replacement| { + let chosen = replacement + .with + .choose_weighted(rng, |value| value.weight()) + .expect("`Replacement` `with` should have items") + .value(); + + (replacement.replace.as_str(), chosen) + }) + .collect() +} + +#[cfg(test)] +mod test { + use super::*; + use serde_json::json; + + #[test] + fn format_now_valid_strftime() { + let mut registry = new_handlebars_registry(); + registry + .register_template_string("t", r#"the date is {{format-time "%Y-%m-%d"}}."#) + .unwrap(); + + let timestamp: i64 = 1599154445000000000; + let value = registry + .render("t", &json!({ "timestamp": timestamp })) + .unwrap(); + + assert_eq!(&value, "the date is 2020-09-03."); + } + + #[test] + #[should_panic(expected = "a Display implementation returned an error unexpectedly: Error")] + fn format_now_invalid_strftime_panics() { + let mut registry = new_handlebars_registry(); + registry + .register_template_string("t", r#"the date is {{format-time "%-B"}}."#) + .unwrap(); + + let timestamp: i64 = 1599154445000000000; + let _value = registry + .render("t", &json!({ "timestamp": timestamp })) + .expect("this is unreachable"); + } + + #[test] + fn format_now_missing_strftime() { + let mut registry = new_handlebars_registry(); + registry + .register_template_string("t", r#"the date is {{format-time}}."#) + .unwrap(); + + let timestamp: i64 = 1599154445000000000; + let result = registry.render("t", &json!({ "timestamp": timestamp })); + + assert!(result.is_err()); + } +} diff --git a/iox_data_generator/src/tag_pair.rs b/iox_data_generator/src/tag_pair.rs new file mode 100644 index 0000000..302fc4d --- /dev/null +++ b/iox_data_generator/src/tag_pair.rs @@ -0,0 +1,173 @@ +//! 
Module for generating tag key/value pairs to be used in the data generator + +use crate::specification::TagPairSpec; +use crate::substitution::new_handlebars_registry; +use handlebars::Handlebars; +use serde_json::{json, Value}; +use snafu::{ResultExt, Snafu}; +use std::fmt::Formatter; +use std::sync::{Arc, Mutex}; + +/// Results specific to the tag_pair module +pub(crate) type Result = std::result::Result; + +/// Errors that may happen while creating or regenerating tag pairs +#[derive(Snafu, Debug)] +pub enum Error { + #[snafu(display( + "Could not compile template for tag pair {} caused by: {}", + tag_key, + source + ))] + CantCompileTemplate { + tag_key: String, + #[snafu(source(from(handlebars::TemplateError, Box::new)))] + source: Box, + }, + + #[snafu(display( + "Could not render template for tag pair {}, cause by: {}", + tag_key, + source + ))] + CantRenderTemplate { + tag_key: String, + #[snafu(source(from(handlebars::RenderError, Box::new)))] + source: Box, + }, +} + +#[derive(Debug)] +pub enum TagPair { + Static(StaticTagPair), + Regenerating(Box>), +} + +impl TagPair { + pub fn pairs_from_specs( + specs: &[TagPairSpec], + mut template_data: Value, + ) -> Result>> { + let tag_pairs: Vec<_> = specs + .iter() + .map(|tag_pair_spec| { + let tag_count = tag_pair_spec.count.unwrap_or(1); + + let tags: Vec<_> = (1..tag_count + 1) + .map(|tag_id| { + let tag_key = if tag_id == 1 { + tag_pair_spec.key.to_string() + } else { + format!("{}{}", tag_pair_spec.key, tag_id) + }; + + let data = template_data.as_object_mut().expect("data must be object"); + data.insert("id".to_string(), json!(tag_id)); + data.insert("line_number".to_string(), json!(1)); + + let mut template = new_handlebars_registry(); + template + .register_template_string(&tag_key, &tag_pair_spec.template) + .context(CantCompileTemplateSnafu { + tag_key: &tag_pair_spec.key, + })?; + + let value = template + .render(&tag_key, &template_data) + .context(CantRenderTemplateSnafu { tag_key: &tag_key })?; + 
+ let tag_pair = StaticTagPair { + key: Arc::new(tag_key), + value: Arc::new(value), + }; + + let tag_pair = if let Some(regenerate_after_lines) = + tag_pair_spec.regenerate_after_lines + { + let regenerating_pair = RegeneratingTagPair { + regenerate_after_lines, + tag_pair, + template, + line_number: 0, + data: template_data.clone(), + }; + + Self::Regenerating(Box::new(Mutex::new(regenerating_pair))) + } else { + Self::Static(tag_pair) + }; + + Ok(Arc::new(tag_pair)) + }) + .collect::>>()?; + + Ok(tags) + }) + .collect::>>()?; + + Ok(tag_pairs.into_iter().flatten().collect()) + } + + pub fn key(&self) -> String { + match self { + Self::Static(p) => p.key.to_string(), + Self::Regenerating(p) => { + let p = p.lock().expect("mutex poisoned"); + p.tag_pair.key.to_string() + } + } + } +} + +/// A tag key/value pair +#[derive(Debug, PartialEq, Eq, PartialOrd, Clone)] +pub struct StaticTagPair { + /// the key + pub key: Arc, + /// the value + pub value: Arc, +} + +impl std::fmt::Display for StaticTagPair { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}={}", self.key, self.value) + } +} + +/// Used for tag pairs specified in either an agent or measurement specification. The +/// spec must be kept around to support regenerating the tag pair. 
+#[derive(Debug, Clone)] +pub struct RegeneratingTagPair { + regenerate_after_lines: usize, + tag_pair: StaticTagPair, + template: Handlebars<'static>, + data: Value, + line_number: usize, +} + +impl RegeneratingTagPair { + pub fn tag_pair(&mut self) -> &StaticTagPair { + self.line_number += 1; + + if self.should_regenerate() { + let data = self.data.as_object_mut().expect("data must be object"); + data.insert("line_number".to_string(), json!(self.line_number)); + + let value = self + .template + .render(self.tag_pair.key.as_str(), &self.data) + .expect("this template has been rendered before so this shouldn't be possible"); + + self.tag_pair = StaticTagPair { + key: Arc::clone(&self.tag_pair.key), + value: Arc::new(value), + }; + } + + &self.tag_pair + } + + fn should_regenerate(&self) -> bool { + self.line_number % (self.regenerate_after_lines + 1) == 0 + } +} diff --git a/iox_data_generator/src/tag_set.rs b/iox_data_generator/src/tag_set.rs new file mode 100644 index 0000000..a92f4a4 --- /dev/null +++ b/iox_data_generator/src/tag_set.rs @@ -0,0 +1,624 @@ +//! Code for defining values and tag sets with tags that are dependent on other tags. + +use crate::now_ns; +use crate::specification::{DataSpec, ValuesSpec}; +use crate::substitution::new_handlebars_registry; +use crate::tag_pair::StaticTagPair; +use handlebars::Handlebars; +use itertools::Itertools; +use serde_json::json; +use snafu::{OptionExt, ResultExt, Snafu}; +/// Module for pre-generated values and tag sets that can be used when generating samples from +/// agents. +use std::collections::BTreeMap; +use std::fmt::Formatter; +use std::sync::Arc; + +/// Errors that may happen while reading a TOML specification. 
+#[derive(Snafu, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("{} specifies a has_one member {} that isn't defined", value, has_one))] + HasOneDependencyNotDefined { value: String, has_one: String }, + + /// Error that may happen when compiling a template from the values specification + #[snafu(display("Could not compile template `{}`, caused by:\n{}", template, source))] + CantCompileTemplate { + /// Underlying Handlebars error that caused this problem + #[snafu(source(from(handlebars::TemplateError, Box::new)))] + source: Box, + /// Template that caused this problem + template: String, + }, + + /// Error that may happen when rendering a template with passed in data + #[snafu(display("Could not render template `{}`, caused by:\n{}", template, source))] + CantRenderTemplate { + /// Underlying Handlebars error that caused this problem + #[snafu(source(from(handlebars::RenderError, Box::new)))] + source: Box, + /// Template that caused this problem + template: String, + }, + + #[snafu(display( + "has_one {} must be accessed through its parent (e.g. parent foo with has_one bar: foo.bar", + has_one + ))] + HasOneWithoutParent { has_one: String }, + + #[snafu(display("no has_one found values for {}", has_one))] + HasOneNotFound { has_one: String }, + + #[snafu(display("has_one {} not found for parent id {}", has_one, parent_id))] + HasOneNotFoundForParent { has_one: String, parent_id: usize }, +} + +type Result = std::result::Result; + +/// A single generated value's id and tag key/value pair. +#[derive(Debug)] +pub struct GeneratedValue { + id: usize, + tag_pair: Arc, +} + +/// All generated tag sets specified +#[derive(Debug, Default)] +pub struct GeneratedTagSets { + // These map the name of a collection of values to its values. All values will have + // an entry in this map. For has_one and child_values, they will have duplicates there + // as well to make generating the tag sets possible. 
+ values: BTreeMap>>, + // each parent-child will have its children stored in this map. The children map + // the id of the parent to the collection of its children values + child_values: BTreeMap>>>, + // each parent-has_one will have its has_ones stored in this map + has_one_values: BTreeMap, + // this maps the name of the tag set specified in the spec to the collection of tag + // sets that were pre-generated. + tag_sets: BTreeMap>>, +} + +/// Generated parent to has_one mappings +#[derive(Debug, Default)] +pub struct ParentToHasOnes { + // each parent id will have its has_ones stored in this map. The map within + // maps the has_one name to its generated value + id_to_has_ones: BTreeMap, Arc>>, +} + +impl GeneratedTagSets { + /// Generate tag sets from a `DataSpec` + pub fn from_spec(spec: &DataSpec) -> Result { + let mut generated_tag_sets = Self::default(); + let mut template = new_handlebars_registry(); + + let mut leftover_specs = -1; + + loop { + if leftover_specs == 0 { + break; + } + + let new_leftover = generated_tag_sets.generate_values(&mut template, spec)? 
as i64;
            if new_leftover == leftover_specs {
                panic!("unresolvable loop in values generation");
            }
            leftover_specs = new_leftover;
        }

        generated_tag_sets.generate_tag_sets(spec)?;

        Ok(generated_tag_sets)
    }

    /// Returns the tag sets for the given name
    pub fn sets_for(&self, name: &str) -> Option<&Arc<Vec<TagSet>>> {
        self.tag_sets.get(name)
    }

    /// Generate all value collections whose dependencies are already satisfied.
    ///
    /// Returns the number of specs that could NOT yet be generated (because a
    /// `has_one`/`belongs_to` dependency is still missing); the caller loops
    /// until this count stops changing.
    fn generate_values(
        &mut self,
        registry: &mut Handlebars<'static>,
        data_spec: &DataSpec,
    ) -> Result<usize> {
        let mut leftover_count = 0;

        for spec in &data_spec.values {
            if self.values.contains_key(&spec.name) {
                continue;
            } else if !self.can_generate(spec) {
                leftover_count += 1;
                continue;
            }

            self.generate_values_spec(registry, spec)?;
        }

        Ok(leftover_count)
    }

    /// Generate every named tag set declared in the spec.
    fn generate_tag_sets(&mut self, data_spec: &DataSpec) -> Result<()> {
        for set_spec in &data_spec.tag_sets {
            self.generate_tag_set_spec(&set_spec.name, &set_spec.for_each)?;
        }

        Ok(())
    }

    /// Generate the cartesian tag set for a single `[[tag_sets]]` entry.
    fn generate_tag_set_spec(&mut self, set_name: &str, for_each: &[String]) -> Result<()> {
        let mut tag_set_keys: Vec<_> = for_each
            .iter()
            .map(|v| Key {
                // the tag key is the last dotted segment ("foo.bar" -> "bar")
                name: v.split('.').last().unwrap(),
                value: v.to_string(),
                position: 0,
            })
            .collect();

        // this weird bit is so that we don't need to sort the tag pairs as we're generating. All
        // tag sets here will have the exact same tags and sort order, so do it once and inject tags
        // in the appropriate place
        let mut sorted_keys: Vec<_> = tag_set_keys.iter_mut().collect();
        // &str has a total order, so use Ord::cmp rather than partial_cmp().unwrap()
        sorted_keys.sort_unstable_by(|a, b| a.name.cmp(b.name));
        for (pos, k) in sorted_keys.iter_mut().enumerate() {
            k.position = pos;
        }

        // we pass in a pre-built tag_pairs vec so that we can fill it out as we walk down the
        // for_each iteration and then just do a single clone at the very end.
        let mut tag_pairs: Vec<_> = (0..for_each.len())
            .map(|_| {
                Arc::new(StaticTagPair {
                    key: Arc::new("default".to_string()),
                    value: Arc::new("default".to_string()),
                })
            })
            .collect();
        let tag_sets = self.for_each_tag_set(None, &tag_set_keys, &mut tag_pairs, 0)?;
        self.tag_sets
            .insert(set_name.to_string(), Arc::new(tag_sets));

        Ok(())
    }

    /// Recursively walk the `for_each` keys, filling `tag_pairs` in place and
    /// emitting one `TagSet` per leaf combination.
    fn for_each_tag_set(
        &self,
        parent_id: Option<usize>,
        keys: &[Key<'_>],
        tag_pairs: &mut Vec<Arc<StaticTagPair>>,
        position: usize,
    ) -> Result<Vec<TagSet>> {
        let key = &keys[position];

        match self.get_generated_values(parent_id, &key.value) {
            Some(values) => {
                if position == keys.len() - 1 {
                    // leaf level: each value completes one tag set
                    let mut tag_sets = Vec::with_capacity(values.len());

                    for v in values {
                        tag_pairs[key.position] = Arc::clone(&v.tag_pair);
                        tag_sets.push(TagSet::new(tag_pairs.clone()));
                    }

                    return Ok(tag_sets);
                }

                let mut tag_sets = vec![];

                for v in values {
                    tag_pairs[key.position] = Arc::clone(&v.tag_pair);
                    let mut sets =
                        self.for_each_tag_set(Some(v.id), keys, tag_pairs, position + 1)?;
                    tag_sets.append(&mut sets);
                }

                Ok(tag_sets)
            }
            None => {
                // not a generated collection: must be a has_one relation keyed by parent id
                let parent_id = parent_id.expect("for_each_tag_set should never be called without a parent id if in has_one evaluation");
                let one = self
                    .has_one_values
                    .get(&key.value)
                    .context(HasOneNotFoundSnafu {
                        has_one: &key.value,
                    })?
                    .id_to_has_ones
                    .get(&parent_id)
                    .context(HasOneNotFoundForParentSnafu {
                        has_one: &key.value,
                        parent_id,
                    })?
                    .get(&key.value)
                    .expect("bug in generating values for has_one");
                let tag = Arc::clone(&one.tag_pair);
                tag_pairs[key.position] = tag;

                if position == keys.len() - 1 {
                    Ok(vec![TagSet::new(tag_pairs.clone())])
                } else {
                    self.for_each_tag_set(Some(parent_id), keys, tag_pairs, position + 1)
                }
            }
        }
    }

    /// Looks up generated values for `key`, scoped to `parent_id` when the
    /// collection is a `belongs_to` child; top-level collections ignore the parent.
    fn get_generated_values(
        &self,
        parent_id: Option<usize>,
        key: &str,
    ) -> Option<&Vec<Arc<GeneratedValue>>> {
        match self.child_values.get(key) {
            Some(child_values) => child_values.get(&parent_id.expect(
                "should never get_get_generated_values for child values without a parent_id",
            )),
            None => self.values.get(key),
        }
    }

    /// True when every `has_one` and `belongs_to` dependency of `spec` has
    /// already been generated.
    fn can_generate(&self, spec: &ValuesSpec) -> bool {
        let has_ones_ready = spec
            .has_one
            .iter()
            .flatten()
            .all(|name| self.values.contains_key(name));

        let belongs_to_ready = spec
            .belongs_to
            .as_ref()
            .map_or(true, |b| self.values.contains_key(b));

        has_ones_ready && belongs_to_ready
    }

    /// Render and store the values for one `[[values]]` spec.
    fn generate_values_spec(
        &mut self,
        template: &mut Handlebars<'static>,
        spec: &ValuesSpec,
    ) -> Result<()> {
        template
            .register_template_string(&spec.name, &spec.template)
            .context(CantCompileTemplateSnafu {
                template: &spec.name,
            })?;

        match &spec.belongs_to {
            Some(belongs_to) => self.generate_belongs_to(template, belongs_to.as_str(), spec)?,
            None => {
                let mut vals = Vec::with_capacity(spec.cardinality);
                let mut id_map = BTreeMap::new();
                let tag_key = Arc::new(spec.name.clone());

                // ids are 1-based
                for i in 1..=spec.cardinality {
                    id_map.insert("id", i);
                    id_map.insert("timestamp", now_ns() as usize);
                    let rendered_value =
                        template
                            .render(&spec.name, &id_map)
                            .context(CantRenderTemplateSnafu {
                                template: &spec.name,
                            })?;
                    let value = Arc::new(rendered_value);

                    vals.push(Arc::new(GeneratedValue {
                        id: i,
                        tag_pair: Arc::new(StaticTagPair {
                            key: Arc::clone(&tag_key),
                            value,
                        }),
                    }));
                }
                self.values.insert(spec.name.to_string(), vals);
            }
        }

        if let Some(has_ones) = spec.has_one.as_ref() {
            self.add_has_ones(&spec.name, has_ones)?;
        }

        Ok(())
    }

    /// Associate each parent value with one value from every `has_one`
    /// collection, cycling through the has_one values round-robin.
    fn add_has_ones(&mut self, parent: &str, has_ones: &[String]) -> Result<()> {
        let parent_values = self
            .values
            .get(parent)
            .expect("add_has_ones should never be called before the parent values are inserted");

        for has_one in has_ones {
            let parent_has_one_key = Arc::new(has_one_values_key(parent, has_one));
            let parent_has_ones = self
                .has_one_values
                .entry(parent_has_one_key.as_str().to_owned())
                .or_default();

            let has_one_values = self.values.get(has_one.as_str()).expect(
                "add_has_ones should never be called before the values collection is created",
            );

            let mut ones_iter = has_one_values.iter();
            for parent in parent_values {
                // wrap around when the has_one collection is exhausted
                let one_val = ones_iter.next().unwrap_or_else(|| {
                    ones_iter = has_one_values.iter();
                    ones_iter.next().unwrap()
                });

                let has_one_map = parent_has_ones.id_to_has_ones.entry(parent.id).or_default();
                has_one_map.insert(Arc::clone(&parent_has_one_key), Arc::clone(one_val));
            }
        }

        Ok(())
    }

    /// Render `spec.cardinality` child values for every parent value, keeping
    /// both the per-parent grouping and the flat "all children" collection.
    fn generate_belongs_to(
        &mut self,
        template: &mut Handlebars<'static>,
        belongs_to: &str,
        spec: &ValuesSpec,
    ) -> Result<()> {
        let parent_values = self.values.get(belongs_to).expect(
            "generate_belongs_to should never be called before the parent values are inserted",
        );
        let tag_key = Arc::new(spec.name.clone());

        let mut all_children = Vec::with_capacity(parent_values.len() * spec.cardinality);

        for parent in parent_values {
            let mut parent_owned = Vec::with_capacity(spec.cardinality);

            for _ in 0..spec.cardinality {
                // child ids are global across all parents, 1-based
                let child_value_id = all_children.len() + 1;
                let data = json!({
                    belongs_to: {
                        "id": parent.id,
                        "value": &parent.tag_pair.value.as_ref(),
                    },
                    "id": child_value_id,
                });

                let rendered_value =
                    template
                        .render(&spec.name, &data)
                        .context(CantRenderTemplateSnafu {
                            template: &spec.name,
                        })?;
                let value = Arc::new(rendered_value);

                let child_value = Arc::new(GeneratedValue {
                    id: child_value_id,
                    tag_pair: Arc::new(StaticTagPair {
                        key: Arc::clone(&tag_key),
                        value,
                    }),
                });

                parent_owned.push(Arc::clone(&child_value));
                all_children.push(child_value);
            }

            let child_vals = self
                .child_values
                .entry(child_values_key(belongs_to, &spec.name))
                .or_default();
            child_vals.insert(parent.id, parent_owned);
        }
        self.values.insert(spec.name.to_string(), all_children);

        Ok(())
    }
}

/// One `for_each` entry: the bare tag key name, the full dotted lookup value,
/// and its position in the (pre-sorted) output tag set.
struct Key<'a> {
    name: &'a str,
    value: String,
    position: usize,
}

fn child_values_key(parent: &str, child: &str) -> String {
    format!("{parent}.{child}")
}

fn has_one_values_key(parent: &str, child: &str) -> String {
    format!("{parent}.{child}")
}

/// A collection of tag key/value pairs
#[derive(Debug)]
pub struct TagSet {
    /// The tags in the set
    pub tags: Vec<Arc<StaticTagPair>>,
}

impl TagSet {
    fn new(tags: Vec<Arc<StaticTagPair>>) -> Self {
        Self { tags }
    }
}

impl std::fmt::Display for TagSet {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let s = self.tags.iter().map(|t| t.to_string()).join(",");
        write!(f, "{s}")
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use std::str::FromStr;

    #[test]
    fn generate_tag_sets_basic() {
        let toml = r#"
name = "demo"

[[values]]
name = "foo"
template = "{{id}}#foo"
cardinality = 3

[[tag_sets]]
name = "testage"
for_each = ["foo"]

[[agents]]
name = "foo"

[[agents.measurements]]
name = "cpu"

[[agents.measurements.fields]]
name = "f1"
i64_range = [0, 23]

[[database_writers]]
agents = [{name = "foo", sampling_interval = "10s"}]"#;

        let spec = DataSpec::from_str(toml).unwrap();
        let tag_sets = GeneratedTagSets::from_spec(&spec).unwrap();
        let testage = tag_sets.sets_for("testage").unwrap();
        let sets = testage.iter().map(|t| t.to_string()).join("\n");
+ let expected = r#" +foo=1#foo +foo=2#foo +foo=3#foo"#; + assert_eq!(expected[1..], sets); + } + + #[test] + fn generate_tag_sets_belongs_to() { + let toml = r#" +name = "demo" + +[[values]] +name = "foo" +template = "{{id}}#foo" +cardinality = 2 + +[[values]] +name = "bar" +template = "{{id}}-{{foo.id}}-{{foo.value}}" +cardinality = 2 +belongs_to = "foo" + +[[tag_sets]] +name = "testage" +for_each = [ + "foo", + "foo.bar", +] + +[[agents]] +name = "foo" + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = "f1" +i64_range = [0, 23] + +[[database_writers]] +agents = [{name = "foo", sampling_interval = "10s"}]"#; + + let spec = DataSpec::from_str(toml).unwrap(); + let tag_sets = GeneratedTagSets::from_spec(&spec).unwrap(); + let testage = tag_sets.sets_for("testage").unwrap(); + let sets = testage.iter().map(|t| t.to_string()).join("\n"); + let expected = r#" +bar=1-1-1#foo,foo=1#foo +bar=2-1-1#foo,foo=1#foo +bar=3-2-2#foo,foo=2#foo +bar=4-2-2#foo,foo=2#foo"#; + assert_eq!(expected[1..], sets); + } + + #[test] + fn generate_tag_sets_test() { + let toml = r#" +name = "demo" + +[[values]] +name = "foo" +template = "{{id}}-foo" +cardinality = 3 +has_one = ["bar"] + +[[values]] +name = "bar" +template = "{{id}}-bar" +cardinality = 2 + +[[values]] +name = "asdf" +template = "{{id}}-asdf" +cardinality = 2 +belongs_to = "foo" +has_one = ["qwer"] + +[[values]] +name = "jkl" +template = "{{id}}-jkl" +cardinality = 2 + +[[values]] +name = "qwer" +template = "{{id}}-qwer" +cardinality = 6 + +[[tag_sets]] +name = "testage" +for_each = [ + "foo", + "foo.bar", + "foo.asdf", + "asdf.qwer", + "jkl" +] + +[[agents]] +name = "foo" + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = "f1" +i64_range = [0, 23] + +[[database_writers]] +database_ratio = 1.0 +agents = [{name = "foo", sampling_interval = "10s"}]"#; + + let spec = DataSpec::from_str(toml).unwrap(); + let tag_sets = GeneratedTagSets::from_spec(&spec).unwrap(); + let 
testage = tag_sets.sets_for("testage").unwrap(); + let sets = testage.iter().map(|t| t.to_string()).join("\n"); + let expected = r#" +asdf=1-asdf,bar=1-bar,foo=1-foo,jkl=1-jkl,qwer=1-qwer +asdf=1-asdf,bar=1-bar,foo=1-foo,jkl=2-jkl,qwer=1-qwer +asdf=2-asdf,bar=1-bar,foo=1-foo,jkl=1-jkl,qwer=2-qwer +asdf=2-asdf,bar=1-bar,foo=1-foo,jkl=2-jkl,qwer=2-qwer +asdf=3-asdf,bar=2-bar,foo=2-foo,jkl=1-jkl,qwer=3-qwer +asdf=3-asdf,bar=2-bar,foo=2-foo,jkl=2-jkl,qwer=3-qwer +asdf=4-asdf,bar=2-bar,foo=2-foo,jkl=1-jkl,qwer=4-qwer +asdf=4-asdf,bar=2-bar,foo=2-foo,jkl=2-jkl,qwer=4-qwer +asdf=5-asdf,bar=1-bar,foo=3-foo,jkl=1-jkl,qwer=5-qwer +asdf=5-asdf,bar=1-bar,foo=3-foo,jkl=2-jkl,qwer=5-qwer +asdf=6-asdf,bar=1-bar,foo=3-foo,jkl=1-jkl,qwer=6-qwer +asdf=6-asdf,bar=1-bar,foo=3-foo,jkl=2-jkl,qwer=6-qwer"#; + assert_eq!(expected[1..], sets); + } +} diff --git a/iox_data_generator/src/write.rs b/iox_data_generator/src/write.rs new file mode 100644 index 0000000..1b0f701 --- /dev/null +++ b/iox_data_generator/src/write.rs @@ -0,0 +1,543 @@ +//! Writing generated points + +use crate::measurement::LineToGenerate; +use bytes::Bytes; +use datafusion_util::{unbounded_memory_pool, MemoryStream}; +use futures::stream; +use influxdb2_client::models::WriteDataPoint; +use mutable_batch_lp::lines_to_batches; +use parquet_file::{metadata::IoxMetadata, serialize}; +use schema::Projection; +use snafu::{ensure, ResultExt, Snafu}; +#[cfg(test)] +use std::{collections::BTreeMap, sync::Arc}; +use std::{ + fs::{self, File, OpenOptions}, + io::{BufWriter, Write}, + path::{Path, PathBuf}, + sync::Mutex, +}; + +/// Errors that may happen while writing points. 
+#[derive(Snafu, Debug)] +pub enum Error { + /// Error that may happen when writing line protocol to a file + #[snafu(display("Couldn't open line protocol file {}: {}", filename.display(), source))] + CantOpenLineProtocolFile { + /// The location of the file we tried to open + filename: PathBuf, + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when writing Parquet to a file + #[snafu(display("Couldn't open Parquet file {}: {}", filename.display(), source))] + CantOpenParquetFile { + /// The location of the file we tried to open + filename: PathBuf, + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when writing line protocol to a no-op sink + #[snafu(display("Could not generate line protocol: {}", source))] + CantWriteToNoOp { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when writing line protocol to a file + #[snafu(display("Could not write line protocol to file: {}", source))] + CantWriteToLineProtocolFile { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when writing line protocol to a Vec of bytes + #[snafu(display("Could not write to vec: {}", source))] + WriteToVec { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when writing Parquet to a file + #[snafu(display("Could not write Parquet: {}", source))] + WriteToParquetFile { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when converting line protocol to a mutable batch + #[snafu(display("Could not convert to a mutable batch: {}", source))] + ConvertToMutableBatch { + /// Underlying mutable_batch_lp error that caused this problem + source: mutable_batch_lp::Error, + }, + + /// Error that may happen when converting a mutable batch to 
an Arrow RecordBatch + #[snafu(display("Could not convert to a record batch: {}", source))] + ConvertToArrow { + /// Underlying mutable_batch error that caused this problem + source: mutable_batch::Error, + }, + + /// Error that may happen when creating a directory to store files to write + /// to + #[snafu(display("Could not create directory: {}", source))] + CantCreateDirectory { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen when checking a path's metadata to see if it's a + /// directory + #[snafu(display("Could not get metadata: {}", source))] + CantGetMetadata { + /// Underlying IO error that caused this problem + source: std::io::Error, + }, + + /// Error that may happen if the path given to the file-based writer isn't a + /// directory + #[snafu(display("Expected to get a directory"))] + MustBeDirectory, + + /// Error that may happen while writing points to the API + #[snafu(display("Could not write points to API: {}", source))] + CantWriteToApi { + /// Underlying Influx client request error that caused this problem + source: influxdb2_client::RequestError, + }, + + /// Error that may happen while trying to create a bucket via the API + #[snafu(display("Could not create bucket: {}", source))] + CantCreateBucket { + /// Underlying Influx client request error that caused this problem + source: influxdb2_client::RequestError, + }, + + /// Error that may happen if attempting to create a bucket without + /// specifying the org ID + #[snafu(display("Could not create a bucket without an `org_id`"))] + OrgIdRequiredToCreateBucket, + + /// Error that may happen when serializing to Parquet + #[snafu(display("Could not serialize to Parquet"))] + ParquetSerialization { + /// Underlying `parquet_file` error that caused this problem + source: parquet_file::serialize::CodecError, + }, +} + +type Result = std::result::Result; + +/// Responsible for holding shared configuration needed to construct per-agent 
+/// points writers +#[derive(Debug)] +pub struct PointsWriterBuilder { + config: PointsWriterConfig, +} + +#[derive(Debug)] +enum PointsWriterConfig { + Api(influxdb2_client::Client), + Directory(PathBuf), + ParquetFile(PathBuf), + NoOp { + perform_write: bool, + }, + #[cfg(test)] + Vector(BTreeMap>>>), + Stdout, +} + +impl PointsWriterBuilder { + /// Write points to the API at the specified host and put them in the + /// specified org and bucket. + pub async fn new_api( + host: impl Into + Send, + token: impl Into + Send, + jaeger_debug: Option<&str>, + ) -> Result { + let host = host.into(); + + // Be somewhat lenient on what we accept as far as host; the client expects the + // protocol to be included. We could pull in the url crate and do more + // verification here. + let host = if host.starts_with("http") { + host + } else { + format!("http://{host}") + }; + + let mut client = influxdb2_client::Client::new(host, token.into()); + if let Some(header) = jaeger_debug { + client = client.with_jaeger_debug(header.to_string()); + } + + Ok(Self { + config: PointsWriterConfig::Api(client), + }) + } + + /// Write points to a file in the directory specified. + pub fn new_file>(path: P) -> Result { + fs::create_dir_all(&path).context(CantCreateDirectorySnafu)?; + let metadata = fs::metadata(&path).context(CantGetMetadataSnafu)?; + ensure!(metadata.is_dir(), MustBeDirectorySnafu); + + Ok(Self { + config: PointsWriterConfig::Directory(PathBuf::from(path.as_ref())), + }) + } + + /// Write points to a Parquet file in the directory specified. 
+ pub fn new_parquet>(path: P) -> Result { + fs::create_dir_all(&path).context(CantCreateDirectorySnafu)?; + let metadata = fs::metadata(&path).context(CantGetMetadataSnafu)?; + ensure!(metadata.is_dir(), MustBeDirectorySnafu); + + Ok(Self { + config: PointsWriterConfig::ParquetFile(PathBuf::from(path.as_ref())), + }) + } + + /// Write points to stdout + pub fn new_std_out() -> Self { + Self { + config: PointsWriterConfig::Stdout, + } + } + + /// Generate points but do not write them anywhere + pub fn new_no_op(perform_write: bool) -> Self { + Self { + config: PointsWriterConfig::NoOp { perform_write }, + } + } + + /// Create a writer out of this writer's configuration for a particular + /// agent that runs in a separate thread/task. + pub fn build_for_agent( + &mut self, + name: impl Into, + org: impl Into, + bucket: impl Into, + ) -> Result { + let inner_writer = match &mut self.config { + PointsWriterConfig::Api(client) => InnerPointsWriter::Api { + client: client.clone(), + org: org.into(), + bucket: bucket.into(), + }, + PointsWriterConfig::Directory(dir_path) => { + let mut filename = dir_path.clone(); + filename.push(name.into()); + filename.set_extension("txt"); + + let file = OpenOptions::new() + .append(true) + .create(true) + .open(&filename) + .context(CantOpenLineProtocolFileSnafu { filename })?; + + let file = Mutex::new(BufWriter::new(file)); + + InnerPointsWriter::File { file } + } + + PointsWriterConfig::ParquetFile(dir_path) => InnerPointsWriter::ParquetFile { + dir_path: dir_path.clone(), + agent_name: name.into(), + }, + + PointsWriterConfig::NoOp { perform_write } => InnerPointsWriter::NoOp { + perform_write: *perform_write, + }, + #[cfg(test)] + PointsWriterConfig::Vector(ref mut agents_by_name) => { + let v = agents_by_name + .entry(name.into()) + .or_insert_with(|| Arc::new(Mutex::new(Vec::new()))); + InnerPointsWriter::Vec(Arc::clone(v)) + } + PointsWriterConfig::Stdout => InnerPointsWriter::Stdout, + }; + + Ok(PointsWriter { inner_writer 
}) + } +} + +/// Responsible for writing points to the location it's been configured for. +#[derive(Debug)] +pub struct PointsWriter { + inner_writer: InnerPointsWriter, +} + +impl PointsWriter { + /// Write these points + pub async fn write_points( + &self, + points: impl Iterator + Send + Sync + 'static, + ) -> Result<()> { + self.inner_writer.write_points(points).await + } +} + +#[derive(Debug)] +enum InnerPointsWriter { + Api { + client: influxdb2_client::Client, + org: String, + bucket: String, + }, + File { + file: Mutex>, + }, + ParquetFile { + dir_path: PathBuf, + agent_name: String, + }, + NoOp { + perform_write: bool, + }, + #[cfg(test)] + Vec(Arc>>), + Stdout, +} + +impl InnerPointsWriter { + async fn write_points( + &self, + points: impl Iterator + Send + Sync + 'static, + ) -> Result<()> { + match self { + Self::Api { + client, + org, + bucket, + } => { + client + .write(org, bucket, stream::iter(points)) + .await + .context(CantWriteToApiSnafu)?; + } + Self::File { file } => { + for point in points { + let mut file = file.lock().expect("Should be able to get lock"); + point + .write_data_point_to(&mut *file) + .context(CantWriteToLineProtocolFileSnafu)?; + } + } + + Self::ParquetFile { + dir_path, + agent_name, + } => { + let mut raw_line_protocol = Vec::new(); + for point in points { + point + .write_data_point_to(&mut raw_line_protocol) + .context(WriteToVecSnafu)?; + } + let line_protocol = String::from_utf8(raw_line_protocol) + .expect("Generator should be creating valid UTF-8"); + + let batches_by_measurement = + lines_to_batches(&line_protocol, 0).context(ConvertToMutableBatchSnafu)?; + + for (measurement, batch) in batches_by_measurement { + let record_batch = batch + .to_arrow(Projection::All) + .context(ConvertToArrowSnafu)?; + let stream = Box::pin(MemoryStream::new(vec![record_batch])); + let meta = IoxMetadata::external(crate::now_ns(), &*measurement); + let pool = unbounded_memory_pool(); + let (data, _parquet_file_meta) = + 
serialize::to_parquet_bytes(stream, &meta, pool) + .await + .context(ParquetSerializationSnafu)?; + let data = Bytes::from(data); + + let mut filename = dir_path.clone(); + filename.push(format!("{agent_name}_{measurement}")); + filename.set_extension("parquet"); + + let file = OpenOptions::new() + .create(true) + .write(true) + .open(&filename) + .context(CantOpenParquetFileSnafu { filename })?; + + let mut file = BufWriter::new(file); + + file.write_all(&data).context(WriteToParquetFileSnafu)?; + } + } + + Self::NoOp { perform_write } => { + if *perform_write { + let mut sink = std::io::sink(); + + for point in points { + point + .write_data_point_to(&mut sink) + .context(CantWriteToNoOpSnafu)?; + } + } + } + #[cfg(test)] + Self::Vec(vec) => { + let vec_ref = Arc::clone(vec); + let mut vec = vec_ref.lock().expect("Should be able to get lock"); + for point in points { + point + .write_data_point_to(&mut *vec) + .expect("Should be able to write to vec"); + } + } + Self::Stdout => { + for point in points { + point + .write_data_point_to(std::io::stdout()) + .expect("should be able to write to stdout"); + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{generate, now_ns, specification::*}; + use std::str::FromStr; + + type Error = Box; + type Result = std::result::Result; + + impl PointsWriterBuilder { + fn new_vec() -> Self { + Self { + config: PointsWriterConfig::Vector(BTreeMap::new()), + } + } + + fn written_data(self, agent_name: &str) -> String { + match self.config { + PointsWriterConfig::Vector(agents_by_name) => { + let bytes_ref = + Arc::clone(agents_by_name.get(agent_name).expect( + "Should have written some data, did not find any for this agent", + )); + let bytes = bytes_ref + .lock() + .expect("Should have been able to get a lock"); + String::from_utf8(bytes.to_vec()).expect("we should be generating valid UTF-8") + } + _ => unreachable!("this method is only valid when writing to a vector for testing"), + } + } + } + + 
#[tokio::test] + async fn test_generate() -> Result<()> { + let toml = r#" +name = "demo_schema" + +[[agents]] +name = "foo" + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = "val" +i64_range = [3,3] + +[[database_writers]] +agents = [{name = "foo", sampling_interval = "1s"}] +"#; + + let data_spec = DataSpec::from_str(toml).unwrap(); + let mut points_writer_builder = PointsWriterBuilder::new_vec(); + + let now = now_ns(); + + generate( + &data_spec, + vec!["foo_bar".to_string()], + &mut points_writer_builder, + Some(now), + Some(now), + now, + false, + 1, + false, + ) + .await?; + + let line_protocol = points_writer_builder.written_data("foo"); + + let expected_line_protocol = format!( + r#"cpu val=3i {now} +"# + ); + assert_eq!(line_protocol, expected_line_protocol); + + Ok(()) + } + + #[tokio::test] + async fn test_generate_batches() -> Result<()> { + let toml = r#" +name = "demo_schema" + +[[agents]] +name = "foo" + +[[agents.measurements]] +name = "cpu" + +[[agents.measurements.fields]] +name = "val" +i64_range = [2, 2] + +[[database_writers]] +agents = [{name = "foo", sampling_interval = "1s"}] +"#; + + let data_spec = DataSpec::from_str(toml).unwrap(); + let mut points_writer_builder = PointsWriterBuilder::new_vec(); + + let now = now_ns(); + + generate( + &data_spec, + vec!["foo_bar".to_string()], + &mut points_writer_builder, + Some(now - 1_000_000_000), + Some(now), + now, + false, + 2, + false, + ) + .await?; + + let line_protocol = points_writer_builder.written_data("foo"); + + let expected_line_protocol = format!( + r#"cpu val=2i {} +cpu val=2i {} +"#, + now - 1_000_000_000, + now + ); + assert_eq!(line_protocol, expected_line_protocol); + + Ok(()) + } +} diff --git a/iox_query/Cargo.toml b/iox_query/Cargo.toml new file mode 100644 index 0000000..e453531 --- /dev/null +++ b/iox_query/Cargo.toml @@ -0,0 +1,55 @@ +[package] +name = "iox_query" +description = "IOx Query Interface and Executor" +version.workspace = true 
+authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +# This crate is designed to be independent of the rest of the IOx +# server and specific storage systems such as Mutable Buffer and Read Buffer. +# +# The rationale for this is to: +# +# 1. Keep change/compile/link time down during development when working on just this crate +# 2. Allow for query logic testing without bringing in all the storage systems. + +[dependencies] # In alphabetical order +arrow = { workspace = true } +arrow_util = { path = "../arrow_util" } +async-trait = "0.1" +chrono = { version = "0.4", default-features = false } +data_types = { path = "../data_types" } +datafusion = { workspace = true } +datafusion_util = { path = "../datafusion_util" } +executor = { path = "../executor"} +futures = "0.3" +hashbrown = { workspace = true } +indexmap = { version = "2.1", features = ["std"] } +itertools = "0.12.0" +iox_time = { path = "../iox_time" } +metric = { path = "../metric" } +object_store = { workspace = true } +observability_deps = { path = "../observability_deps" } +once_cell = "1" +parking_lot = "0.12" +parquet_file = { path = "../parquet_file" } +query_functions = { path = "../query_functions"} +schema = { path = "../schema" } +snafu = "0.8" +tokio = { version = "1.35", features = ["macros", "parking_lot"] } +tokio-stream = "0.1" +trace = { path = "../trace" } +tracker = { path = "../tracker" } +predicate = { path = "../predicate" } +uuid = { version = "1", features = ["v4"] } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] # In alphabetical order +test_helpers = { path = "../test_helpers" } +assert_matches = "1" +insta = { version = "1", features = ["yaml"] } +serde = { version = "1.0", features = ["derive"] } diff --git a/iox_query/README.md b/iox_query/README.md new file mode 100644 index 0000000..c522983 --- /dev/null +++ b/iox_query/README.md @@ -0,0 +1,3 @@ +# IOx Query Layer + +See 
[InfluxDB IOx -- Query Processing](../docs/query_processing.md) for details. diff --git a/iox_query/src/chunk_statistics.rs b/iox_query/src/chunk_statistics.rs new file mode 100644 index 0000000..0430347 --- /dev/null +++ b/iox_query/src/chunk_statistics.rs @@ -0,0 +1,289 @@ +//! Tools to set up DataFusion statistics. + +use std::{collections::HashMap, sync::Arc}; + +use data_types::TimestampMinMax; +use datafusion::common::stats::Precision; +use datafusion::{ + physical_plan::{ColumnStatistics, Statistics}, + scalar::ScalarValue, +}; +use datafusion_util::{option_to_precision, timestamptz_nano}; +use schema::{InfluxColumnType, Schema}; + +/// Represent known min/max values for a specific column. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ColumnRange { + pub min_value: Arc, + pub max_value: Arc, +} + +/// Represents the known min/max values for a subset (not all) of the columns in a partition. +/// +/// The values may not actually in any row. +/// +/// These ranges apply to ALL rows (esp. in ALL files and ingester chunks) within in given partition. +pub type ColumnRanges = Arc, ColumnRange>>; + +/// Returns the min/max values for the range, if present +fn range_to_min_max_stats( + range: Option<&ColumnRange>, +) -> (Precision, Precision) { + let Some(range) = range else { + return (Precision::Absent, Precision::Absent); + }; + ( + Precision::Exact(range.min_value.as_ref().clone()), + Precision::Exact(range.max_value.as_ref().clone()), + ) +} + +/// Create chunk [statistics](Statistics). 
+pub fn create_chunk_statistics( + row_count: Option, + schema: &Schema, + ts_min_max: Option, + ranges: Option<&ColumnRanges>, +) -> Statistics { + let mut columns = Vec::with_capacity(schema.len()); + + for (t, field) in schema.iter() { + let stats = match t { + InfluxColumnType::Timestamp => { + // prefer explicitely given time range but fall back to column ranges + let (min_value, max_value) = match ts_min_max { + Some(ts_min_max) => ( + Precision::Exact(timestamptz_nano(ts_min_max.min)), + Precision::Exact(timestamptz_nano(ts_min_max.max)), + ), + None => { + let range = + ranges.and_then(|ranges| ranges.get::(field.name().as_ref())); + + range_to_min_max_stats(range) + } + }; + + ColumnStatistics { + null_count: Precision::Exact(0), + min_value, + max_value, + distinct_count: Precision::Absent, + } + } + _ => { + let range = ranges.and_then(|ranges| ranges.get::(field.name().as_ref())); + + let (min_value, max_value) = range_to_min_max_stats(range); + + ColumnStatistics { + null_count: Precision::Absent, + min_value, + max_value, + distinct_count: Precision::Absent, + } + } + }; + columns.push(stats) + } + + let num_rows = option_to_precision(row_count); + + Statistics { + num_rows, + total_byte_size: Precision::Absent, + column_statistics: columns, + } +} + +#[cfg(test)] +mod tests { + use schema::{InfluxFieldType, SchemaBuilder, TIME_COLUMN_NAME}; + + use super::*; + + #[test] + fn test_create_chunk_statistics_no_columns_no_rows() { + let schema = SchemaBuilder::new().build().unwrap(); + let row_count = 0; + + let actual = create_chunk_statistics(Some(row_count), &schema, None, None); + let expected = Statistics { + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![], + }; + assert_eq!(actual, expected); + } + + #[test] + fn test_create_chunk_statistics_no_columns_null_rows() { + let schema = SchemaBuilder::new().build().unwrap(); + + let actual = create_chunk_statistics(None, &schema, None, None); + let 
expected = Statistics { + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: vec![], + }; + assert_eq!(actual, expected); + } + + #[test] + fn test_create_chunk_statistics() { + let schema = full_schema(); + let ts_min_max = TimestampMinMax { min: 10, max: 20 }; + let ranges = Arc::new(HashMap::from([ + ( + Arc::from("tag1"), + ColumnRange { + min_value: Arc::new(ScalarValue::from("aaa")), + max_value: Arc::new(ScalarValue::from("bbb")), + }, + ), + ( + Arc::from("tag3"), // does not exist in schema + ColumnRange { + min_value: Arc::new(ScalarValue::from("ccc")), + max_value: Arc::new(ScalarValue::from("ddd")), + }, + ), + ( + Arc::from("field_integer"), + ColumnRange { + min_value: Arc::new(ScalarValue::from(10i64)), + max_value: Arc::new(ScalarValue::from(20i64)), + }, + ), + ])); + + for row_count in [0usize, 1337usize] { + let actual = + create_chunk_statistics(Some(row_count), &schema, Some(ts_min_max), Some(&ranges)); + let expected = Statistics { + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![ + // tag1 + ColumnStatistics { + null_count: Precision::Absent, + min_value: Precision::Exact(ScalarValue::from("aaa")), + max_value: Precision::Exact(ScalarValue::from("bbb")), + distinct_count: Precision::Absent, + }, + // tag2 + ColumnStatistics::default(), + // field_bool + ColumnStatistics::default(), + // field_float + ColumnStatistics::default(), + // field_integer + ColumnStatistics { + null_count: Precision::Absent, + min_value: Precision::Exact(ScalarValue::from(10i64)), + max_value: Precision::Exact(ScalarValue::from(20i64)), + distinct_count: Precision::Absent, + }, + // field_string + ColumnStatistics::default(), + // field_uinteger + ColumnStatistics::default(), + // time + ColumnStatistics { + null_count: Precision::Exact(0), + min_value: Precision::Exact(timestamptz_nano(10)), + max_value: Precision::Exact(timestamptz_nano(20)), + distinct_count: 
Precision::Absent, + }, + ], + }; + assert_eq!(actual, expected); + } + } + + #[test] + fn test_create_chunk_statistics_ts_min_max_overrides_column_range() { + let schema = full_schema(); + let row_count = 42usize; + let ts_min_max = TimestampMinMax { min: 10, max: 20 }; + let ranges = Arc::new(HashMap::from([( + Arc::from(TIME_COLUMN_NAME), + ColumnRange { + min_value: Arc::new(timestamptz_nano(12)), + max_value: Arc::new(timestamptz_nano(22)), + }, + )])); + + let actual = + create_chunk_statistics(Some(row_count), &schema, Some(ts_min_max), Some(&ranges)); + let expected = Statistics { + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![ + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics { + null_count: Precision::Exact(0), + min_value: Precision::Exact(timestamptz_nano(10)), + max_value: Precision::Exact(timestamptz_nano(20)), + distinct_count: Precision::Absent, + }, + ], + }; + assert_eq!(actual, expected); + } + + #[test] + fn test_create_chunk_statistics_ts_min_max_none_so_fallback_to_column_range() { + let schema = full_schema(); + let row_count = 42usize; + let ranges = Arc::new(HashMap::from([( + Arc::from(TIME_COLUMN_NAME), + ColumnRange { + min_value: Arc::new(timestamptz_nano(12)), + max_value: Arc::new(timestamptz_nano(22)), + }, + )])); + + let actual = create_chunk_statistics(Some(row_count), &schema, None, Some(&ranges)); + let expected = Statistics { + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![ + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics::default(), + ColumnStatistics { + null_count: 
Precision::Exact(0), + min_value: Precision::Exact(timestamptz_nano(12)), + max_value: Precision::Exact(timestamptz_nano(22)), + distinct_count: Precision::Absent, + }, + ], + }; + assert_eq!(actual, expected); + } + + fn full_schema() -> Schema { + SchemaBuilder::new() + .tag("tag1") + .tag("tag2") + .influx_field("field_bool", InfluxFieldType::Boolean) + .influx_field("field_float", InfluxFieldType::Float) + .influx_field("field_integer", InfluxFieldType::Integer) + .influx_field("field_string", InfluxFieldType::String) + .influx_field("field_uinteger", InfluxFieldType::UInteger) + .timestamp() + .build() + .unwrap() + } +} diff --git a/iox_query/src/config.rs b/iox_query/src/config.rs new file mode 100644 index 0000000..6dc2235 --- /dev/null +++ b/iox_query/src/config.rs @@ -0,0 +1,94 @@ +use std::{str::FromStr, time::Duration}; + +use datafusion::{common::extensions_options, config::ConfigExtension}; + +/// IOx-specific config extension prefix. +pub const IOX_CONFIG_PREFIX: &str = "iox"; + +extensions_options! { + /// Config options for IOx. + pub struct IoxConfigExt { + /// When splitting de-duplicate operations based on IOx partitions[^iox_part], this is the maximum number of IOx + /// partitions that should be considered. If there are more partitions, the split will NOT be performed. + /// + /// This protects against certain highly degenerative plans. + /// + /// + /// [^iox_part]: "IOx partition" refers to a partition within the IOx catalog, i.e. a partition within the + /// primary key space. This is NOT the same as a DataFusion partition which refers to a stream + /// within the physical plan data flow. + pub max_dedup_partition_split: usize, default = 10_000 + + /// When splitting de-duplicate operations based on time-based overlaps, this is the maximum number of groups + /// that should be considered. If there are more groups, the split will NOT be performed. + /// + /// This protects against certain highly degenerative plans. 
+ pub max_dedup_time_split: usize, default = 100 + + /// When multiple parquet files are required in a sorted way (e.g. for de-duplication), we have two options: + /// + /// 1. **In-mem sorting:** Put them into [`target_partitions`] DataFusion partitions. This limits the fan-out, + /// but requires that we potentially chain multiple parquet files into a single DataFusion partition. Since + /// chaining sorted data does NOT automatically result in sorted data (e.g. AB-AB is not sorted), we need to + /// perform an in-memory sort using [`SortExec`] afterwards. This is expensive. + /// 2. **Fan-out:** Instead of chaining files within DataFusion partitions, we can accept a fan-out beyond + /// [`target_partitions`]. This prevents in-memory sorting but may result in OOMs (out-of-memory). + /// + /// We try to pick option 2 up to a certain number of files, which is configured by this setting. + /// + /// + /// [`SortExec`]: datafusion::physical_plan::sorts::sort::SortExec + /// [`target_partitions`]: datafusion::common::config::ExecutionOptions::target_partitions + pub max_parquet_fanout: usize, default = 40 + + /// Cutoff date for InfluxQL metadata queries. + pub influxql_metadata_cutoff: MetadataCutoff, default = MetadataCutoff::Relative(Duration::from_secs(3600 * 24)) + } +} + +impl ConfigExtension for IoxConfigExt { + const PREFIX: &'static str = IOX_CONFIG_PREFIX; +} + +/// Optional datetime. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MetadataCutoff { + Absolute(chrono::DateTime), + Relative(Duration), +} + +#[derive(Debug)] +pub struct ParseError(String); + +impl std::fmt::Display for ParseError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::error::Error for ParseError {} + +impl FromStr for MetadataCutoff { + type Err = ParseError; + + fn from_str(s: &str) -> Result { + if let Some(s) = s.strip_prefix('-') { + let delta = u64::from_str(s).map_err(|e| ParseError(e.to_string()))?; + let delta = Duration::from_nanos(delta); + Ok(Self::Relative(delta)) + } else { + let dt = chrono::DateTime::::from_str(s) + .map_err(|e| ParseError(e.to_string()))?; + Ok(Self::Absolute(dt)) + } + } +} + +impl std::fmt::Display for MetadataCutoff { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Relative(delta) => write!(f, "-{}", delta.as_nanos()), + Self::Absolute(dt) => write!(f, "{}", dt), + } + } +} diff --git a/iox_query/src/exec.rs b/iox_query/src/exec.rs new file mode 100644 index 0000000..abb8ba5 --- /dev/null +++ b/iox_query/src/exec.rs @@ -0,0 +1,814 @@ +//! This module handles the manipulation / execution of storage +//! plans. This is currently implemented using DataFusion, and this +//! 
interface abstracts away many of the details +pub(crate) mod context; +pub mod field; +pub mod fieldlist; +pub mod gapfill; +mod metrics; +mod non_null_checker; +pub mod query_tracing; +mod schema_pivot; +pub mod seriesset; +pub mod sleep; +pub(crate) mod split; +pub mod stringset; +use datafusion_util::config::register_iox_object_store; +use executor::DedicatedExecutor; +use metric::Registry; +use object_store::DynObjectStore; +use parquet_file::storage::StorageId; +mod cross_rt_stream; + +use std::{collections::HashMap, fmt::Display, num::NonZeroUsize, sync::Arc}; + +use datafusion::{ + self, + execution::{ + disk_manager::DiskManagerConfig, + memory_pool::MemoryPool, + runtime_env::{RuntimeConfig, RuntimeEnv}, + }, + logical_expr::{expr_rewriter::normalize_col, Extension}, + logical_expr::{Expr, LogicalPlan}, +}; + +pub use context::{IOxSessionConfig, IOxSessionContext, SessionContextIOxExt}; +use schema_pivot::SchemaPivotNode; + +use crate::exec::metrics::DataFusionMemoryPoolMetricsBridge; + +use self::{non_null_checker::NonNullCheckerNode, split::StreamSplitNode}; + +const TESTING_MEM_POOL_SIZE: usize = 1024 * 1024 * 1024; // 1GB + +/// Configuration for an Executor +#[derive(Debug, Clone)] +pub struct ExecutorConfig { + /// Number of threads per thread pool + pub num_threads: NonZeroUsize, + + /// Target parallelism for query execution + pub target_query_partitions: NonZeroUsize, + + /// Object stores + pub object_stores: HashMap>, + + /// Metric registry + pub metric_registry: Arc, + + /// Memory pool size in bytes. 
+ pub mem_pool_size: usize, +} + +impl ExecutorConfig { + pub fn testing() -> Self { + Self { + num_threads: NonZeroUsize::new(1).unwrap(), + target_query_partitions: NonZeroUsize::new(1).unwrap(), + object_stores: HashMap::default(), + metric_registry: Arc::new(Registry::default()), + mem_pool_size: TESTING_MEM_POOL_SIZE, + } + } +} + +impl Display for ExecutorConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "num_threads={}, target_query_partitions={}, mem_pool_size={}", + self.num_threads, self.target_query_partitions, self.mem_pool_size + ) + } +} + +#[derive(Debug)] +pub struct DedicatedExecutors { + /// Executor for running user queries + query_exec: DedicatedExecutor, + + /// Executor for running system/reorganization tasks such as + /// compact + reorg_exec: DedicatedExecutor, + + /// Number of threads per thread pool + num_threads: NonZeroUsize, +} + +impl DedicatedExecutors { + pub fn new(num_threads: NonZeroUsize, metric_registry: Arc) -> Self { + let query_exec = + DedicatedExecutor::new("IOx Query", num_threads, Arc::clone(&metric_registry)); + let reorg_exec = DedicatedExecutor::new("IOx Reorg", num_threads, metric_registry); + + Self { + query_exec, + reorg_exec, + num_threads, + } + } + + pub fn new_testing() -> Self { + let query_exec = DedicatedExecutor::new_testing(); + let reorg_exec = DedicatedExecutor::new_testing(); + assert_eq!(query_exec.num_threads(), reorg_exec.num_threads()); + let num_threads = query_exec.num_threads(); + Self { + query_exec, + reorg_exec, + num_threads, + } + } + + pub fn num_threads(&self) -> NonZeroUsize { + self.num_threads + } +} + +/// Handles executing DataFusion plans, and marshalling the results into rust +/// native structures. 
+#[derive(Debug)] +pub struct Executor { + /// Executors + executors: Arc, + + /// The default configuration options with which to create contexts + config: ExecutorConfig, + + /// The DataFusion [RuntimeEnv] (including memory manager and disk + /// manager) used for all executions + runtime: Arc, +} + +impl Display for Executor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Executor({})", self.config) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ExecutorType { + /// Run using the pool for queries + Query, + + /// Run using the pool for system / reorganization tasks + Reorg, +} + +impl Executor { + /// Creates a new executor with two dedicated thread pools, each + /// with num_threads + pub fn new( + num_threads: NonZeroUsize, + mem_pool_size: usize, + metric_registry: Arc, + ) -> Self { + Self::new_with_config(ExecutorConfig { + num_threads, + target_query_partitions: num_threads, + object_stores: HashMap::default(), + metric_registry, + mem_pool_size, + }) + } + + /// Create new executor based on a specific config. + pub fn new_with_config(config: ExecutorConfig) -> Self { + let executors = Arc::new(DedicatedExecutors::new( + config.num_threads, + Arc::clone(&config.metric_registry), + )); + Self::new_with_config_and_executors(config, executors) + } + + /// Get testing executor that runs on a single thread with a low memory bound + /// to preserve resources. + pub fn new_testing() -> Self { + let config = ExecutorConfig::testing(); + let executors = Arc::new(DedicatedExecutors::new_testing()); + Self::new_with_config_and_executors(config, executors) + } + + /// Low-level constructor. + /// + /// This is mostly useful if you want to keep the executors (because they are quite expensive to create) but need a fresh IOx runtime. + /// + /// # Panic + /// Panics if the number of threads in `executors` is different from `config`. 
+ pub fn new_with_config_and_executors( + config: ExecutorConfig, + executors: Arc, + ) -> Self { + assert_eq!(config.num_threads, executors.num_threads); + + let runtime_config = RuntimeConfig::new() + .with_disk_manager(DiskManagerConfig::Disabled) + .with_memory_limit(config.mem_pool_size, 1.0); + + let runtime = Arc::new(RuntimeEnv::new(runtime_config).expect("creating runtime")); + for (id, store) in &config.object_stores { + register_iox_object_store(&runtime, id, Arc::clone(store)); + } + + // As there should only be a single memory pool for any executor, + // verify that there was no existing instrument registered (for another pool) + let mut created = false; + let created_captured = &mut created; + let bridge = + DataFusionMemoryPoolMetricsBridge::new(&runtime.memory_pool, config.mem_pool_size); + let bridge_ctor = move || { + *created_captured = true; + bridge + }; + config + .metric_registry + .register_instrument("datafusion_pool", bridge_ctor); + assert!( + created, + "More than one execution pool created: previously existing instrument" + ); + + Self { + executors, + config, + runtime, + } + } + + /// Return a new execution config, suitable for executing a new query or system task. + /// + /// Note that this context (and all its clones) will be shut down once `Executor` is dropped. + pub fn new_execution_config(&self, executor_type: ExecutorType) -> IOxSessionConfig { + let exec = self.executor(executor_type).clone(); + IOxSessionConfig::new(exec, Arc::clone(&self.runtime)) + .with_target_partitions(self.config.target_query_partitions) + } + + /// Create a new execution context, suitable for executing a new query or system task + /// + /// Note that this context (and all its clones) will be shut down once `Executor` is dropped. 
+ pub fn new_context(&self, executor_type: ExecutorType) -> IOxSessionContext { + self.new_execution_config(executor_type).build() + } + + /// Return the execution pool of the specified type + pub fn executor(&self, executor_type: ExecutorType) -> &DedicatedExecutor { + match executor_type { + ExecutorType::Query => &self.executors.query_exec, + ExecutorType::Reorg => &self.executors.reorg_exec, + } + } + + /// Initializes shutdown. + pub fn shutdown(&self) { + self.executors.query_exec.shutdown(); + self.executors.reorg_exec.shutdown(); + } + + /// Stops all subsequent task executions, and waits for the worker + /// thread to complete. Note this will shutdown all created contexts. + /// + /// Only the first call to `join` will actually wait for the + /// executing thread to complete. All other calls to join will + /// complete immediately. + pub async fn join(&self) { + self.executors.query_exec.join().await; + self.executors.reorg_exec.join().await; + } + + /// Returns the memory pool associated with this `Executor` + pub fn pool(&self) -> Arc { + Arc::clone(&self.runtime.memory_pool) + } + + /// Returns underlying config. + pub fn config(&self) -> &ExecutorConfig { + &self.config + } +} + +// No need to implement `Drop` because this is done by DedicatedExecutor already + +/// Create a SchemaPivot node which takes an arbitrary input like +/// ColA | ColB | ColC +/// ------+------+------ +/// 1 | NULL | NULL +/// 2 | 2 | NULL +/// 3 | 2 | NULL +/// +/// And pivots it to a table with a single string column for any +/// columns that had non null values. +/// +/// non_null_column +/// ----------------- +/// "ColA" +/// "ColB" +pub fn make_schema_pivot(input: LogicalPlan) -> LogicalPlan { + let node = Arc::new(SchemaPivotNode::new(input)); + + LogicalPlan::Extension(Extension { node }) +} + +/// Make a NonNullChecker node that takes an arbitrary input array and +/// produces a single string output column that contains +/// +/// 1. 
the single `table_name` string if any of the input columns are non-null +/// 2. zero rows if all of the input columns are null +/// +/// For this input: +/// +/// ColA | ColB | ColC +/// ------+------+------ +/// 1 | NULL | NULL +/// 2 | 2 | NULL +/// 3 | 2 | NULL +/// +/// The output would be (given 'the_table_name' was the table name) +/// +/// non_null_column +/// ----------------- +/// the_table_name +/// +/// However, for this input (All NULL) +/// +/// ColA | ColB | ColC +/// ------+------+------ +/// NULL | NULL | NULL +/// NULL | NULL | NULL +/// NULL | NULL | NULL +/// +/// There would be no output rows +/// +/// non_null_column +/// ----------------- +pub fn make_non_null_checker(table_name: &str, input: LogicalPlan) -> LogicalPlan { + let node = Arc::new(NonNullCheckerNode::new(table_name, input)); + + LogicalPlan::Extension(Extension { node }) +} + +/// Create a StreamSplit node which takes an input stream of record +/// batches and produces multiple output streams based on a list of `N` predicates. +/// The output will have `N+1` streams, and each row is sent to the stream +/// corresponding to the first predicate that evaluates to true, or the last stream if none do. 
+/// +/// For example, if the input looks like: +/// ```text +/// X | time +/// ---+----- +/// a | 1000 +/// b | 4000 +/// c | 2000 +/// ``` +/// +/// A StreamSplit with split_exprs = [`time <= 1000`, `1000 < time <=2000`] will produce the +/// following three output streams (output DataFusion Partitions): +/// +/// +/// ```text +/// X | time +/// ---+----- +/// a | 1000 +/// ``` +/// +/// ```text +/// X | time +/// ---+----- +/// b | 2000 +/// ``` +/// and +/// ```text +/// X | time +/// ---+----- +/// b | 4000 +/// ``` +pub fn make_stream_split(input: LogicalPlan, split_exprs: Vec) -> LogicalPlan { + // rewrite the input expression so that it is fully qualified with the input schema + let split_exprs = split_exprs + .into_iter() + .map(|split_expr| normalize_col(split_expr, &input).expect("normalize is infallable")) + .collect::>(); + + let node = Arc::new(StreamSplitNode::new(input, split_exprs)); + LogicalPlan::Extension(Extension { node }) +} + +#[cfg(test)] +mod tests { + use arrow::{ + array::{ArrayRef, Int64Array, StringArray}, + datatypes::{DataType, Field, Schema, SchemaRef}, + }; + use datafusion::{ + datasource::{provider_as_source, MemTable}, + error::DataFusionError, + logical_expr::LogicalPlanBuilder, + physical_expr::PhysicalSortExpr, + physical_plan::{ + expressions::Column, sorts::sort::SortExec, DisplayAs, ExecutionPlan, RecordBatchStream, + }, + }; + use futures::{stream::BoxStream, Stream, StreamExt}; + use metric::{Observation, RawReporter}; + use stringset::StringSet; + use tokio::sync::Barrier; + + use super::*; + use crate::exec::stringset::StringSetRef; + use crate::plan::stringset::StringSetPlan; + use arrow::record_batch::RecordBatch; + + #[tokio::test] + async fn executor_known_string_set_plan_ok() { + let expected_strings = to_set(&["Foo", "Bar"]); + let plan = StringSetPlan::Known(Arc::clone(&expected_strings)); + + let exec = Executor::new_testing(); + let ctx = exec.new_context(ExecutorType::Query); + let result_strings = 
ctx.to_string_set(plan).await.unwrap(); + assert_eq!(result_strings, expected_strings); + } + + #[tokio::test] + async fn executor_datafusion_string_set_single_plan_no_batches() { + // Test with a single plan that produces no batches + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)])); + let scan = make_plan(schema, vec![]); + let plan: StringSetPlan = vec![scan].into(); + + let exec = Executor::new_testing(); + let ctx = exec.new_context(ExecutorType::Query); + let results = ctx.to_string_set(plan).await.unwrap(); + + assert_eq!(results, StringSetRef::new(StringSet::new())); + } + + #[tokio::test] + async fn executor_datafusion_string_set_single_plan_one_batch() { + // Test with a single plan that produces one record batch + let data = to_string_array(&["foo", "bar", "baz", "foo"]); + let batch = RecordBatch::try_from_iter_with_nullable(vec![("a", data, true)]) + .expect("created new record batch"); + let scan = make_plan(batch.schema(), vec![batch]); + let plan: StringSetPlan = vec![scan].into(); + + let exec = Executor::new_testing(); + let ctx = exec.new_context(ExecutorType::Query); + let results = ctx.to_string_set(plan).await.unwrap(); + + assert_eq!(results, to_set(&["foo", "bar", "baz"])); + } + + #[tokio::test] + async fn executor_datafusion_string_set_single_plan_two_batch() { + // Test with a single plan that produces multiple record batches + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)])); + let data1 = to_string_array(&["foo", "bar"]); + let batch1 = RecordBatch::try_new(Arc::clone(&schema), vec![data1]) + .expect("created new record batch"); + let data2 = to_string_array(&["baz", "foo"]); + let batch2 = RecordBatch::try_new(Arc::clone(&schema), vec![data2]) + .expect("created new record batch"); + let scan = make_plan(schema, vec![batch1, batch2]); + let plan: StringSetPlan = vec![scan].into(); + + let exec = Executor::new_testing(); + let ctx = exec.new_context(ExecutorType::Query); + 
let results = ctx.to_string_set(plan).await.unwrap(); + + assert_eq!(results, to_set(&["foo", "bar", "baz"])); + } + + #[tokio::test] + async fn executor_datafusion_string_set_multi_plan() { + // Test with multiple datafusion logical plans + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)])); + + let data1 = to_string_array(&["foo", "bar"]); + let batch1 = RecordBatch::try_new(Arc::clone(&schema), vec![data1]) + .expect("created new record batch"); + let scan1 = make_plan(Arc::clone(&schema), vec![batch1]); + + let data2 = to_string_array(&["baz", "foo"]); + let batch2 = RecordBatch::try_new(Arc::clone(&schema), vec![data2]) + .expect("created new record batch"); + let scan2 = make_plan(schema, vec![batch2]); + + let plan: StringSetPlan = vec![scan1, scan2].into(); + + let exec = Executor::new_testing(); + let ctx = exec.new_context(ExecutorType::Query); + let results = ctx.to_string_set(plan).await.unwrap(); + + assert_eq!(results, to_set(&["foo", "bar", "baz"])); + } + + #[tokio::test] + async fn executor_datafusion_string_set_nulls() { + // Ensure that nulls in the output set are handled reasonably + // (error, rather than silently ignored) + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)])); + let array = StringArray::from_iter(vec![Some("foo"), None]); + let data = Arc::new(array); + let batch = RecordBatch::try_new(Arc::clone(&schema), vec![data]) + .expect("created new record batch"); + let scan = make_plan(schema, vec![batch]); + let plan: StringSetPlan = vec![scan].into(); + + let exec = Executor::new_testing(); + let ctx = exec.new_context(ExecutorType::Query); + let results = ctx.to_string_set(plan).await; + + let actual_error = match results { + Ok(_) => "Unexpected Ok".into(), + Err(e) => format!("{e}"), + }; + let expected_error = "unexpected null value"; + assert!( + actual_error.contains(expected_error), + "expected error '{expected_error}' not found in '{actual_error:?}'", + ); + } + + 
#[tokio::test] + async fn executor_datafusion_string_set_bad_schema() { + // Ensure that an incorrect schema (an int) gives a reasonable error + let data: ArrayRef = Arc::new(Int64Array::from(vec![1])); + let batch = + RecordBatch::try_from_iter(vec![("a", data)]).expect("created new record batch"); + let scan = make_plan(batch.schema(), vec![batch]); + let plan: StringSetPlan = vec![scan].into(); + + let exec = Executor::new_testing(); + let ctx = exec.new_context(ExecutorType::Query); + let results = ctx.to_string_set(plan).await; + + let actual_error = match results { + Ok(_) => "Unexpected Ok".into(), + Err(e) => format!("{e}"), + }; + + let expected_error = "schema not a single Utf8"; + assert!( + actual_error.contains(expected_error), + "expected error '{expected_error}' not found in '{actual_error:?}'" + ); + } + + #[tokio::test] + async fn make_schema_pivot_is_planned() { + // Test that all the planning logic is wired up and that we + // can make a plan using a SchemaPivot node + let batch = RecordBatch::try_from_iter_with_nullable(vec![ + ("f1", to_string_array(&["foo", "bar"]), true), + ("f2", to_string_array(&["baz", "bzz"]), true), + ]) + .expect("created new record batch"); + + let scan = make_plan(batch.schema(), vec![batch]); + let pivot = make_schema_pivot(scan); + let plan = vec![pivot].into(); + + let exec = Executor::new_testing(); + let ctx = exec.new_context(ExecutorType::Query); + let results = ctx.to_string_set(plan).await.expect("Executed plan"); + + assert_eq!(results, to_set(&["f1", "f2"])); + } + + #[tokio::test] + async fn test_metrics_integration() { + let exec = Executor::new_testing(); + + // start w/o any reservation + assert_eq!( + PoolMetrics::read(&exec.config.metric_registry), + PoolMetrics { + reserved: 0, + limit: TESTING_MEM_POOL_SIZE as u64, + }, + ); + + // block some reservation + let test_input = Arc::new(TestExec::default()); + let schema = test_input.schema(); + let plan = Arc::new(SortExec::new( + vec![PhysicalSortExpr { 
+ expr: Arc::new(Column::new_with_schema("c", &schema).unwrap()), + options: Default::default(), + }], + Arc::clone(&test_input) as _, + )); + let ctx = exec.new_context(ExecutorType::Query); + let handle = tokio::spawn(async move { + ctx.collect(plan).await.unwrap(); + }); + test_input.wait().await; + assert_eq!( + PoolMetrics::read(&exec.config.metric_registry), + PoolMetrics { + reserved: 896, + limit: TESTING_MEM_POOL_SIZE as u64, + }, + ); + test_input.wait_for_finish().await; + + // end w/o any reservation + handle.await.unwrap(); + assert_eq!( + PoolMetrics::read(&exec.config.metric_registry), + PoolMetrics { + reserved: 0, + limit: TESTING_MEM_POOL_SIZE as u64, + }, + ); + } + + /// return a set for testing + fn to_set(strs: &[&str]) -> StringSetRef { + StringSetRef::new(strs.iter().map(|s| s.to_string()).collect::()) + } + + fn to_string_array(strs: &[&str]) -> ArrayRef { + let array: StringArray = strs.iter().map(|s| Some(*s)).collect(); + Arc::new(array) + } + + // creates a DataFusion plan that reads the RecordBatches into memory + fn make_plan(schema: SchemaRef, data: Vec) -> LogicalPlan { + let partitions = vec![data]; + + let projection = None; + + // model one partition, + let table = MemTable::try_new(schema, partitions).unwrap(); + let source = provider_as_source(Arc::new(table)); + + LogicalPlanBuilder::scan("memtable", source, projection) + .unwrap() + .build() + .unwrap() + } + + #[derive(Debug)] + struct TestExec { + schema: SchemaRef, + // Barrier after a batch has been produced + barrier: Arc, + // Barrier right before the operator is complete + barrier_finish: Arc, + } + + impl Default for TestExec { + fn default() -> Self { + Self { + schema: Arc::new(arrow::datatypes::Schema::new(vec![Field::new( + "c", + DataType::Int64, + true, + )])), + barrier: Arc::new(Barrier::new(2)), + barrier_finish: Arc::new(Barrier::new(2)), + } + } + } + + impl TestExec { + /// wait for the first output to be produced + pub async fn wait(&self) { + 
self.barrier.wait().await; + } + + /// wait for output to be done + pub async fn wait_for_finish(&self) { + self.barrier_finish.wait().await; + } + } + + impl DisplayAs for TestExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter<'_>, + ) -> std::fmt::Result { + write!(f, "TestExec") + } + } + + impl ExecutionPlan for TestExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + datafusion::physical_plan::Partitioning::UnknownPartitioning(1) + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> datafusion::error::Result> { + unimplemented!() + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> datafusion::error::Result + { + let barrier = Arc::clone(&self.barrier); + let schema = Arc::clone(&self.schema); + let barrier_finish = Arc::clone(&self.barrier_finish); + let schema_finish = Arc::clone(&self.schema); + let stream = futures::stream::iter([Ok(RecordBatch::try_new( + Arc::clone(&self.schema), + vec![Arc::new(Int64Array::from(vec![1i64; 100]))], + ) + .unwrap())]) + .chain(futures::stream::once(async move { + barrier.wait().await; + Ok(RecordBatch::new_empty(schema)) + })) + .chain(futures::stream::once(async move { + barrier_finish.wait().await; + Ok(RecordBatch::new_empty(schema_finish)) + })); + let stream = BoxRecordBatchStream { + schema: Arc::clone(&self.schema), + inner: stream.boxed(), + }; + Ok(Box::pin(stream)) + } + + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) + } + } + + struct BoxRecordBatchStream { + schema: SchemaRef, + inner: BoxStream<'static, Result>, + } + + impl Stream for BoxRecordBatchStream { + type Item = 
Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let this = &mut *self; + this.inner.poll_next_unpin(cx) + } + } + + impl RecordBatchStream for BoxRecordBatchStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + } + + #[derive(Debug, PartialEq, Eq)] + struct PoolMetrics { + reserved: u64, + limit: u64, + } + + impl PoolMetrics { + fn read(registry: &Registry) -> Self { + let mut reporter = RawReporter::default(); + registry.report(&mut reporter); + let metric = reporter.metric("datafusion_mem_pool_bytes").unwrap(); + + let reserved = metric.observation(&[("state", "reserved")]).unwrap(); + let Observation::U64Gauge(reserved) = reserved else { + panic!("wrong metric type") + }; + let limit = metric.observation(&[("state", "limit")]).unwrap(); + let Observation::U64Gauge(limit) = limit else { + panic!("wrong metric type") + }; + + Self { + reserved: *reserved, + limit: *limit, + } + } + } +} diff --git a/iox_query/src/exec/context.rs b/iox_query/src/exec/context.rs new file mode 100644 index 0000000..ad60c7a --- /dev/null +++ b/iox_query/src/exec/context.rs @@ -0,0 +1,753 @@ +//! This module contains plumbing to connect InfluxDB IOx extensions to +//! 
DataFusion + +use super::{ + cross_rt_stream::CrossRtStream, + gapfill::{plan_gap_fill, GapFill}, + non_null_checker::NonNullCheckerNode, + seriesset::{series::Either, SeriesSet}, + sleep::SleepNode, + split::StreamSplitNode, +}; +use crate::{ + config::IoxConfigExt, + exec::{ + fieldlist::{FieldList, IntoFieldList}, + non_null_checker::NonNullCheckerExec, + query_tracing::TracedStream, + schema_pivot::{SchemaPivotExec, SchemaPivotNode}, + seriesset::{ + converter::{GroupGenerator, SeriesSetConverter}, + series::Series, + }, + split::StreamSplitExec, + stringset::{IntoStringSet, StringSetRef}, + }, + logical_optimizer::register_iox_logical_optimizers, + physical_optimizer::register_iox_physical_optimizers, + plan::{ + fieldlist::FieldListPlan, + seriesset::{SeriesSetPlan, SeriesSetPlans}, + stringset::StringSetPlan, + }, +}; +use arrow::record_batch::RecordBatch; +use async_trait::async_trait; +use datafusion::{ + catalog::CatalogProvider, + common::ParamValues, + execution::{ + context::{QueryPlanner, SessionState, TaskContext}, + memory_pool::MemoryPool, + runtime_env::RuntimeEnv, + }, + logical_expr::{LogicalPlan, UserDefinedLogicalNode}, + physical_plan::{ + coalesce_partitions::CoalescePartitionsExec, displayable, stream::RecordBatchStreamAdapter, + EmptyRecordBatchStream, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, + }, + physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}, + prelude::*, +}; +use datafusion_util::config::{iox_session_config, DEFAULT_CATALOG}; +use executor::DedicatedExecutor; +use futures::{Stream, StreamExt, TryStreamExt}; +use observability_deps::tracing::{debug, warn}; +use query_functions::{register_scalar_functions, selectors::register_selector_aggregates}; +use std::{fmt, num::NonZeroUsize, sync::Arc}; +use trace::{ + ctx::SpanContext, + span::{MetaValue, Span, SpanEvent, SpanExt, SpanRecorder}, +}; + +// Reuse DataFusion error and Result types for this module +pub use 
datafusion::error::{DataFusionError, Result}; + +/// This structure implements the DataFusion notion of "query planner" +/// and is needed to create plans with the IOx extension nodes. +struct IOxQueryPlanner {} + +#[async_trait] +impl QueryPlanner for IOxQueryPlanner { + /// Given a `LogicalPlan` created from above, create an + /// `ExecutionPlan` suitable for execution + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + session_state: &SessionState, + ) -> Result> { + // Teach the default physical planner how to plan SchemaPivot + // and StreamSplit nodes. + let physical_planner = + DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(IOxExtensionPlanner {})]); + // Delegate most work of physical planning to the default physical planner + physical_planner + .create_physical_plan(logical_plan, session_state) + .await + } +} + +/// Physical planner for InfluxDB IOx extension plans +struct IOxExtensionPlanner {} + +#[async_trait] +impl ExtensionPlanner for IOxExtensionPlanner { + /// Create a physical plan for an extension node + async fn plan_extension( + &self, + planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + session_state: &SessionState, + ) -> Result>> { + let any = node.as_any(); + let plan = if let Some(schema_pivot) = any.downcast_ref::() { + assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs"); + Some(Arc::new(SchemaPivotExec::new( + Arc::clone(&physical_inputs[0]), + schema_pivot.schema().as_ref().clone().into(), + )) as Arc) + } else if let Some(non_null_checker) = any.downcast_ref::() { + assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs"); + Some(Arc::new(NonNullCheckerExec::new( + Arc::clone(&physical_inputs[0]), + non_null_checker.schema().as_ref().clone().into(), + non_null_checker.value(), + )) as Arc) + } else if let Some(stream_split) = any.downcast_ref::() { + assert_eq!( + logical_inputs.len(), + 
1, + "Inconsistent number of logical inputs" + ); + assert_eq!( + physical_inputs.len(), + 1, + "Inconsistent number of physical inputs" + ); + + let split_exprs = stream_split + .split_exprs() + .iter() + .map(|e| { + planner.create_physical_expr( + e, + logical_inputs[0].schema(), + &physical_inputs[0].schema(), + session_state, + ) + }) + .collect::>>()?; + + Some(Arc::new(StreamSplitExec::new( + Arc::clone(&physical_inputs[0]), + split_exprs, + )) as Arc) + } else if let Some(gap_fill) = any.downcast_ref::() { + let gap_fill_exec = plan_gap_fill( + session_state.execution_props(), + gap_fill, + logical_inputs, + physical_inputs, + )?; + Some(Arc::new(gap_fill_exec) as Arc) + } else if let Some(sleep) = any.downcast_ref::() { + let sleep = sleep.plan(planner, logical_inputs, physical_inputs, session_state)?; + Some(Arc::new(sleep) as _) + } else { + None + }; + Ok(plan) + } +} + +/// Configuration for an IOx execution context +/// +/// Created from an Executor +#[derive(Clone)] +pub struct IOxSessionConfig { + /// Executor to run on + exec: DedicatedExecutor, + + /// DataFusion session configuration + session_config: SessionConfig, + + /// Shared DataFusion runtime + runtime: Arc, + + /// Default catalog + default_catalog: Option>, + + /// Span context from which to create spans for this query + span_ctx: Option, +} + +impl fmt::Debug for IOxSessionConfig { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "IOxSessionConfig ...") + } +} + +impl IOxSessionConfig { + pub(super) fn new(exec: DedicatedExecutor, runtime: Arc) -> Self { + let mut session_config = iox_session_config(); + session_config + .options_mut() + .extensions + .insert(IoxConfigExt::default()); + + Self { + exec, + session_config, + runtime, + default_catalog: None, + span_ctx: None, + } + } + + /// Set execution concurrency + pub fn with_target_partitions(mut self, target_partitions: NonZeroUsize) -> Self { + self.session_config = self + .session_config + 
.with_target_partitions(target_partitions.get()); + self + } + + /// Set the default catalog provider + pub fn with_default_catalog(self, catalog: Arc) -> Self { + Self { + default_catalog: Some(catalog), + ..self + } + } + + /// Set the span context from which to create distributed tracing spans for this query + pub fn with_span_context(self, span_ctx: Option) -> Self { + Self { span_ctx, ..self } + } + + /// Set DataFusion [config option]. + /// + /// May be used to set [IOx-specific] option as well. + /// + /// + /// [config option]: datafusion::common::config::ConfigOptions + /// [IOx-specific]: crate::config::IoxConfigExt + pub fn with_config_option(mut self, key: &str, value: &str) -> Self { + // ignore invalid config + if let Err(e) = self.session_config.options_mut().set(key, value) { + warn!( + key, + value, + %e, + "invalid DataFusion config", + ); + } + self + } + + /// Create an ExecutionContext suitable for executing DataFusion plans + pub fn build(self) -> IOxSessionContext { + let maybe_span = self.span_ctx.child_span("Query Execution"); + let recorder = SpanRecorder::new(maybe_span); + + // attach span to DataFusion session + let session_config = self + .session_config + .with_extension(Arc::new(recorder.span().cloned())); + + let state = SessionState::new_with_config_rt(session_config, self.runtime) + .with_query_planner(Arc::new(IOxQueryPlanner {})); + let state = register_iox_physical_optimizers(state); + let state = register_iox_logical_optimizers(state); + + let inner = SessionContext::new_with_state(state); + register_selector_aggregates(&inner); + register_scalar_functions(&inner); + if let Some(default_catalog) = self.default_catalog { + inner.register_catalog(DEFAULT_CATALOG, default_catalog); + } + + IOxSessionContext::new(inner, self.exec, recorder) + } +} + +/// This is an execution context for planning in IOx. It wraps a +/// DataFusion execution context with the information needed for planning. 
+/// +/// Methods on this struct should be preferred to using the raw +/// DataFusion functions (such as `collect`) directly. +/// +/// Eventually we envision this also managing additional resource +/// types such as Memory and providing visibility into what plans are +/// running +/// +/// An IOxSessionContext is created directly from an Executor, or from +/// an IOxSessionConfig created by an Executor +pub struct IOxSessionContext { + inner: SessionContext, + + /// Dedicated executor for query execution. + /// + /// DataFusion plans are "CPU" bound and thus can consume tokio + /// executors threads for extended periods of time. We use a + /// dedicated tokio runtime to run them so that other requests + /// can be handled. + exec: DedicatedExecutor, + + /// Span context from which to create spans for this query + recorder: SpanRecorder, +} + +impl fmt::Debug for IOxSessionContext { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("IOxSessionContext") + .field("inner", &"") + .field("exec", &self.exec) + .field("recorder", &self.recorder) + .finish() + } +} + +impl IOxSessionContext { + /// Constructor for testing. + /// + /// This is identical to [`Default::default`] but we do NOT implement [`Default`] to make the creation of untracked + /// contexts more explicit. + pub fn with_testing() -> Self { + Self { + inner: SessionContext::default(), + exec: DedicatedExecutor::new_testing(), + recorder: SpanRecorder::default(), + } + } + + /// Private constructor + pub(crate) fn new( + inner: SessionContext, + exec: DedicatedExecutor, + recorder: SpanRecorder, + ) -> Self { + Self { + inner, + exec, + recorder, + } + } + + /// returns a reference to the inner datafusion execution context + pub fn inner(&self) -> &SessionContext { + &self.inner + } + + /// Plan a SQL statement. This assumes that any tables referenced + /// in the SQL have been registered with this context. Use + /// `create_physical_plan` to actually execute the query. 
+ pub async fn sql_to_logical_plan(&self, sql: &str) -> Result { + Self::sql_to_logical_plan_with_params(self, sql, ParamValues::List(vec![])).await + } + + /// Plan a SQL statement, providing a list of parameter values + /// to supply to `$placeholder` variables. This assumes that + /// any tables referenced in the SQL have been registered with + /// this context. Use `create_physical_plan` to actually execute + /// the query. + pub async fn sql_to_logical_plan_with_params( + &self, + sql: &str, + params: impl Into + Send, + ) -> Result { + let ctx = self.child_ctx("sql_to_logical_plan"); + debug!(text=%sql, "planning SQL query"); + let plan = ctx + .inner + .state() + .create_logical_plan(sql) + .await? + .with_param_values(params.into())?; + // ensure the plan does not contain unwanted statements + let verifier = SQLOptions::new() + .with_allow_ddl(false) // no CREATE ... + .with_allow_dml(false) // no INSERT or COPY + .with_allow_statements(false); // no SET VARIABLE, etc + verifier.verify_plan(&plan)?; + Ok(plan) + } + + /// Create a logical plan that reads a single [`RecordBatch`]. Use + /// `create_physical_plan` to actually execute the query. + pub fn batch_to_logical_plan(&self, batch: RecordBatch) -> Result { + let ctx = self.child_ctx("batch_to_logical_plan"); + debug!(num_rows = batch.num_rows(), "planning RecordBatch query"); + ctx.inner.read_batch(batch)?.into_optimized_plan() + } + + /// Plan a SQL statement and convert it to an execution plan. This assumes that any + /// tables referenced in the SQL have been registered with this context + pub async fn sql_to_physical_plan(&self, sql: &str) -> Result> { + Self::sql_to_physical_plan_with_params(self, sql, ParamValues::List(vec![])).await + } + + /// Plan a SQL statement and convert it to an execution plan, providing a list of + /// parameter values to supply to `$placeholder` variables. 
This assumes that any + /// tables referenced in the SQL have been registered with this context + pub async fn sql_to_physical_plan_with_params( + &self, + sql: &str, + params: impl Into + Send, + ) -> Result> { + let ctx = self.child_ctx("sql_to_physical_plan"); + + let logical_plan = ctx.sql_to_logical_plan_with_params(sql, params).await?; + ctx.create_physical_plan(&logical_plan).await + } + + /// Prepare (optimize + plan) a pre-created [`LogicalPlan`] for execution + pub async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + ) -> Result> { + let mut ctx = self.child_ctx("create_physical_plan"); + debug!(text=%logical_plan.display_indent_schema(), "create_physical_plan: initial plan"); + let physical_plan = ctx.inner.state().create_physical_plan(logical_plan).await?; + + ctx.recorder.event(SpanEvent::new("physical plan")); + debug!(text=%displayable(physical_plan.as_ref()).indent(false), "create_physical_plan: plan to run"); + Ok(physical_plan) + } + + /// Executes the logical plan using DataFusion on a separate + /// thread pool and produces RecordBatches + pub async fn collect(&self, physical_plan: Arc) -> Result> { + debug!( + "Running plan, physical:\n{}", + displayable(physical_plan.as_ref()).indent(false) + ); + let ctx = self.child_ctx("collect"); + let stream = ctx.execute_stream(physical_plan).await?; + + ctx.run( + stream + .err_into() // convert to DataFusionError + .try_collect(), + ) + .await + } + + /// Executes the physical plan and produces a + /// `SendableRecordBatchStream` to stream over the result that + /// iterates over the results. The creation of the stream is + /// performed in a separate thread pool. 
+ pub async fn execute_stream( + &self, + physical_plan: Arc, + ) -> Result { + match physical_plan.output_partitioning().partition_count() { + 0 => Ok(Box::pin(EmptyRecordBatchStream::new( + physical_plan.schema(), + ))), + 1 => self.execute_stream_partitioned(physical_plan, 0).await, + _ => { + // Merge into a single partition + self.execute_stream_partitioned( + Arc::new(CoalescePartitionsExec::new(physical_plan)), + 0, + ) + .await + } + } + } + + /// Executes a single partition of a physical plan and produces a + /// `SendableRecordBatchStream` to stream over the result that + /// iterates over the results. The creation of the stream is + /// performed in a separate thread pool. + pub async fn execute_stream_partitioned( + &self, + physical_plan: Arc, + partition: usize, + ) -> Result { + let span = self + .recorder + .span() + .map(|span| span.child("execute_stream_partitioned")); + + let task_context = Arc::new(TaskContext::from(self.inner())); + + let stream = self + .run(async move { + let stream = physical_plan.execute(partition, task_context)?; + Ok(TracedStream::new(stream, span, physical_plan)) + }) + .await?; + // Wrap the resulting stream into `CrossRtStream`. This is required because polling the DataFusion result stream + // actually drives the (potentially CPU-bound) work. We need to make sure that this work stays within the + // dedicated executor because otherwise this may block the top-level tokio/tonic runtime which may lead to + // requests timetouts (either for new requests, metrics or even for HTTP2 pings on the active connection). 
+ let schema = stream.schema(); + let stream = CrossRtStream::new_with_df_error_stream(stream, self.exec.clone()); + let stream = RecordBatchStreamAdapter::new(schema, stream); + Ok(Box::pin(stream)) + } + + /// Executes the SeriesSetPlans on the query executor, in + /// parallel, producing series or groups + pub async fn to_series_and_groups( + &self, + series_set_plans: SeriesSetPlans, + memory_pool: Arc, + points_per_batch: usize, + ) -> Result>> { + let SeriesSetPlans { + mut plans, + group_columns, + } = series_set_plans; + + if plans.is_empty() { + return Ok(futures::stream::empty().boxed()); + } + + // sort plans by table (measurement) name + plans.sort_by(|a, b| a.table_name.cmp(&b.table_name)); + + // Run the plans in parallel + let ctx = self.child_ctx("to_series_set"); + let exec = self.exec.clone(); + let data = futures::stream::iter(plans) + .then(move |plan| { + let ctx = ctx.child_ctx("for plan"); + let exec = exec.clone(); + + async move { + let stream = Self::run_inner(exec.clone(), async move { + let SeriesSetPlan { + table_name, + plan, + tag_columns, + field_columns, + } = plan; + + let tag_columns = Arc::new(tag_columns); + + let physical_plan = ctx.create_physical_plan(&plan).await?; + + let it = ctx.execute_stream(physical_plan).await?; + + SeriesSetConverter::default() + .convert(table_name, tag_columns, field_columns, it) + .await + }) + .await?; + + Ok::<_, DataFusionError>(CrossRtStream::new_with_df_error_stream(stream, exec)) + } + }) + .try_flatten() + .try_filter_map(move |series_set: SeriesSet| async move { + // If all timestamps of returned columns are nulls, + // there must be no data. We need to check this because + // aggregate (e.g. count, min, max) returns one row that are + // all null (even the values of aggregate) for min, max and 0 for count. + // For influx read_group's series and group, we do not want to return 0 + // for count either. 
+ if series_set.is_timestamp_all_null() { + return Ok(None); + } + + let series: Vec = + series_set.try_into_series(points_per_batch).map_err(|e| { + DataFusionError::Execution(format!("Error converting to series: {e}")) + })?; + Ok(Some(futures::stream::iter(series).map(Ok))) + }) + .try_flatten(); + + // If we have group columns, sort the results, and create the + // appropriate groups + if let Some(group_columns) = group_columns { + let grouper = GroupGenerator::new(group_columns, memory_pool); + Ok(grouper.group(data).await?.boxed()) + } else { + Ok(data.map_ok(|series| series.into()).boxed()) + } + } + + /// Executes `plan` and return the resulting FieldList on the query executor + pub async fn to_field_list(&self, plan: FieldListPlan) -> Result { + let FieldListPlan { + known_values, + extra_plans, + } = plan; + + // Run the plans in parallel + let handles = extra_plans + .into_iter() + .map(|plan| { + let ctx = self.child_ctx("to_field_list"); + self.run(async move { + let physical_plan = ctx.create_physical_plan(&plan).await?; + + // TODO: avoid this buffering + let field_list = + ctx.collect(physical_plan) + .await? 
+ .into_fieldlist() + .map_err(|e| { + DataFusionError::Context( + "Error converting to field list".to_string(), + Box::new(DataFusionError::External(Box::new(e))), + ) + })?; + + Ok(field_list) + }) + }) + .collect::>(); + + // collect them all up and combine them + let mut results = Vec::new(); + + if !known_values.is_empty() { + let list = known_values.into_iter().map(|f| f.1).collect(); + results.push(FieldList { fields: list }) + } + + for join_handle in handles { + let fieldlist = join_handle.await?; + + results.push(fieldlist); + } + + // TODO: Stream this + results.into_fieldlist().map_err(|e| { + DataFusionError::Context( + "Error converting to field list".to_string(), + Box::new(DataFusionError::External(Box::new(e))), + ) + }) + } + + /// Executes this plan on the query pool, and returns the + /// resulting set of strings + pub async fn to_string_set(&self, plan: StringSetPlan) -> Result { + let ctx = self.child_ctx("to_string_set"); + match plan { + StringSetPlan::Known(ss) => Ok(ss), + StringSetPlan::Plan(plans) => ctx + .run_logical_plans(plans) + .await? 
+ .into_stringset() + .map_err(|e| { + DataFusionError::Context( + "Error converting to stringset".to_string(), + Box::new(DataFusionError::External(Box::new(e))), + ) + }), + } + } + + /// plans and runs the plans in parallel and collects the results + /// run each plan in parallel and collect the results + async fn run_logical_plans(&self, plans: Vec) -> Result> { + let value_futures = plans + .into_iter() + .map(|plan| { + let ctx = self.child_ctx("run_logical_plans"); + self.run(async move { + let physical_plan = ctx.create_physical_plan(&plan).await?; + + // TODO: avoid this buffering + ctx.collect(physical_plan).await + }) + }) + .collect::>(); + + // now, wait for all the values to resolve and collect them together + let mut results = Vec::new(); + for join_handle in value_futures { + let mut plan_result = join_handle.await?; + results.append(&mut plan_result); + } + Ok(results) + } + + /// Runs the provided future using this execution context + pub async fn run(&self, fut: Fut) -> Result + where + Fut: std::future::Future> + Send + 'static, + T: Send + 'static, + { + Self::run_inner(self.exec.clone(), fut).await + } + + async fn run_inner(exec: DedicatedExecutor, fut: Fut) -> Result + where + Fut: std::future::Future> + Send + 'static, + T: Send + 'static, + { + exec.spawn(fut).await.unwrap_or_else(|e| { + Err(DataFusionError::Context( + "Join Error".to_string(), + Box::new(DataFusionError::External(Box::new(e))), + )) + }) + } + + /// Returns a IOxSessionContext with a SpanRecorder that is a child of the current + pub fn child_ctx(&self, name: &'static str) -> Self { + Self::new( + self.inner.clone(), + self.exec.clone(), + self.recorder.child(name), + ) + } + + /// Record an event on the span recorder + pub fn record_event(&mut self, name: &'static str) { + self.recorder.event(SpanEvent::new(name)); + } + + /// Record an event on the span recorder + pub fn set_metadata(&mut self, name: &'static str, value: impl Into) { + self.recorder.set_metadata(name, 
value); + } + + /// Returns the current [`Span`] if any + pub fn span(&self) -> Option<&Span> { + self.recorder.span() + } + + /// Returns a new child span of the current context + pub fn child_span(&self, name: &'static str) -> Option { + self.recorder.child_span(name) + } + + /// Number of currently active tasks. + pub fn tasks(&self) -> usize { + self.exec.tasks() + } +} + +/// Extension trait to pull IOx spans out of DataFusion contexts. +pub trait SessionContextIOxExt { + /// Get child span of the current context. + fn child_span(&self, name: &'static str) -> Option; + + /// Get span context + fn span_ctx(&self) -> Option; +} + +impl SessionContextIOxExt for SessionState { + fn child_span(&self, name: &'static str) -> Option { + self.config() + .get_extension::>() + .and_then(|span| span.as_ref().as_ref().map(|span| span.child(name))) + } + + fn span_ctx(&self) -> Option { + self.config() + .get_extension::>() + .and_then(|span| span.as_ref().as_ref().map(|span| span.ctx.clone())) + } +} diff --git a/iox_query/src/exec/cross_rt_stream.rs b/iox_query/src/exec/cross_rt_stream.rs new file mode 100644 index 0000000..c5303cb --- /dev/null +++ b/iox_query/src/exec/cross_rt_stream.rs @@ -0,0 +1,357 @@ +//! Tooling to pull [`Stream`]s from one tokio runtime into another. +//! +//! This is critical so that CPU heavy loads are not run on the same runtime as IO handling +use std::{ + future::Future, + pin::Pin, + task::{Context, Poll}, +}; + +use datafusion::error::DataFusionError; +use executor::DedicatedExecutor; +use futures::{future::BoxFuture, ready, FutureExt, Stream, StreamExt}; +use tokio::sync::mpsc::{channel, Sender}; +use tokio_stream::wrappers::ReceiverStream; + +/// [`Stream`] that is calculated by one tokio runtime but can safely be pulled from another w/o stalling (esp. when the +/// calculating runtime is CPU-blocked). +pub struct CrossRtStream { + /// Future that drives the underlying stream. 
+ /// + /// This is actually wrapped into [`DedicatedExecutor::spawn`] so it can be safely polled by the receiving runtime. + driver: BoxFuture<'static, ()>, + + /// Flags if the [driver](Self::driver) returned [`Poll::Ready`]. + driver_ready: bool, + + /// Receiving stream. + /// + /// This one can be polled from the receiving runtime. + inner: ReceiverStream, + + /// Signals that [`inner`](Self::inner) finished. + /// + /// Note that we must also drive the [driver](Self::driver) even when the stream finished to allow proper state clean-ups. + inner_done: bool, +} + +impl std::fmt::Debug for CrossRtStream { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CrossRtStream") + .field("driver", &"...") + .field("driver_ready", &self.driver_ready) + .field("inner", &"...") + .field("inner_done", &self.inner_done) + .finish() + } +} + +impl CrossRtStream { + /// Create new stream by producing a future that sends its state to the given [`Sender`]. + /// + /// This is an internal method. `f` should always be wrapped into [`DedicatedExecutor::spawn`] (except for testing purposes). + fn new_with_tx(f: F) -> Self + where + F: FnOnce(Sender) -> Fut, + Fut: Future + Send + 'static, + { + let (tx, rx) = channel(1); + let driver = f(tx).boxed(); + Self { + driver, + driver_ready: false, + inner: ReceiverStream::new(rx), + inner_done: false, + } + } +} + +impl CrossRtStream> +where + X: Send + 'static, + E: Send + 'static, +{ + /// Create new stream based on an existing stream that transports [`Result`]s. + /// + /// Also receives an executor that actually executes the underlying stream as well as a converter that convets + /// [`executor::JobError`] to the error type of the stream (so we can send potential crashes/panics). 
+ fn new_with_error_stream(stream: S, exec: DedicatedExecutor, converter: C) -> Self + where + S: Stream> + Send + 'static, + C: Fn(executor::JobError) -> E + Send + 'static, + { + Self::new_with_tx(|tx| { + // future to be run in the other runtime + let tx_captured = tx.clone(); + let fut = async move { + tokio::pin!(stream); + + while let Some(res) = stream.next().await { + if tx_captured.send(res).await.is_err() { + // receiver gone + return; + } + } + }; + + // future for this runtime (likely the tokio/tonic/web driver) + async move { + if let Err(e) = exec.spawn(fut).await { + let e = converter(e); + + // last message, so we don't care about the receiver side + tx.send(Err(e)).await.ok(); + } + } + }) + } +} + +impl CrossRtStream> +where + X: Send + 'static, +{ + /// Create new stream based on an existing stream that transports [`Result`]s w/ [`DataFusionError`]s. + /// + /// Also receives an executor that actually executes the underlying stream. + pub fn new_with_df_error_stream(stream: S, exec: DedicatedExecutor) -> Self + where + S: Stream> + Send + 'static, + { + Self::new_with_error_stream(stream, exec, |e| { + DataFusionError::Context( + "Join Error (panic)".to_string(), + Box::new(DataFusionError::External(e.into())), + ) + }) + } +} + +impl Stream for CrossRtStream { + type Item = T; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = &mut *self; + + if !this.driver_ready { + let res = this.driver.poll_unpin(cx); + + if res.is_ready() { + this.driver_ready = true; + } + } + + if this.inner_done { + if this.driver_ready { + Poll::Ready(None) + } else { + Poll::Pending + } + } else { + match ready!(this.inner.poll_next_unpin(cx)) { + None => { + this.inner_done = true; + if this.driver_ready { + Poll::Ready(None) + } else { + Poll::Pending + } + } + Some(x) => Poll::Ready(Some(x)), + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, time::Duration}; + + use super::*; + use tokio::runtime::{Handle, 
RuntimeFlavor}; + + #[tokio::test] + async fn test_async_block() { + let exec = DedicatedExecutor::new_testing(); + let barrier1 = Arc::new(tokio::sync::Barrier::new(2)); + let barrier1_captured = Arc::clone(&barrier1); + let barrier2 = Arc::new(tokio::sync::Barrier::new(2)); + let barrier2_captured = Arc::clone(&barrier2); + let mut stream = CrossRtStream::>::new_with_error_stream( + futures::stream::once(async move { + barrier1_captured.wait().await; + barrier2_captured.wait().await; + Ok(1) + }), + exec, + std::convert::identity, + ); + + let mut f = stream.next(); + + ensure_pending(&mut f).await; + barrier1.wait().await; + ensure_pending(&mut f).await; + barrier2.wait().await; + + let res = f.await.expect("streamed data"); + assert_eq!(res.unwrap(), 1); + } + + #[tokio::test] + async fn test_sync_block() { + // This would deadlock if the stream payload would run within the same tokio runtime. To prevent any cheating + // (e.g. via channels), we ensure that the current runtime only has a single thread: + assert_eq!( + RuntimeFlavor::CurrentThread, + Handle::current().runtime_flavor() + ); + + let exec = DedicatedExecutor::new_testing(); + let barrier1 = Arc::new(std::sync::Barrier::new(2)); + let barrier1_captured = Arc::clone(&barrier1); + let barrier2 = Arc::new(std::sync::Barrier::new(2)); + let barrier2_captured = Arc::clone(&barrier2); + let mut stream = CrossRtStream::>::new_with_error_stream( + futures::stream::once(async move { + barrier1_captured.wait(); + barrier2_captured.wait(); + Ok(1) + }), + exec, + std::convert::identity, + ); + + let mut f = stream.next(); + + ensure_pending(&mut f).await; + barrier1.wait(); + ensure_pending(&mut f).await; + barrier2.wait(); + + let res = f.await.expect("streamed data"); + assert_eq!(res.unwrap(), 1); + } + + #[tokio::test] + async fn test_panic() { + let exec = DedicatedExecutor::new_testing(); + let mut stream = CrossRtStream::>::new_with_error_stream( + futures::stream::once(async { panic!("foo") }), + exec, 
+ std::convert::identity, + ); + + let e = stream + .next() + .await + .expect("stream not finished") + .unwrap_err(); + assert_eq!(e.to_string(), "Panic: foo"); + + let none = stream.next().await; + assert!(none.is_none()); + } + + #[tokio::test] + async fn test_cancel_future() { + let exec = DedicatedExecutor::new_testing(); + let barrier1 = Arc::new(tokio::sync::Barrier::new(2)); + let barrier1_captured = Arc::clone(&barrier1); + let barrier2 = Arc::new(tokio::sync::Barrier::new(2)); + let barrier2_captured = Arc::clone(&barrier2); + let mut stream = CrossRtStream::>::new_with_error_stream( + futures::stream::once(async move { + barrier1_captured.wait().await; + barrier2_captured.wait().await; + Ok(1) + }), + exec, + std::convert::identity, + ); + + let mut f = stream.next(); + + // fire up stream + ensure_pending(&mut f).await; + barrier1.wait().await; + + // cancel + drop(f); + + barrier2.wait().await; + let res = stream.next().await.expect("streamed data"); + assert_eq!(res.unwrap(), 1); + } + + #[tokio::test] + async fn test_cancel_stream() { + let exec = DedicatedExecutor::new_testing(); + let barrier = Arc::new(tokio::sync::Barrier::new(2)); + let barrier_captured = Arc::clone(&barrier); + let mut stream = CrossRtStream::>::new_with_error_stream( + futures::stream::once(async move { + barrier_captured.wait().await; + + // block forever + futures::future::pending::<()>().await; + + // keep barrier Arc alive + drop(barrier_captured); + unreachable!() + }), + exec, + std::convert::identity, + ); + + let mut f = stream.next(); + + // fire up stream + ensure_pending(&mut f).await; + barrier.wait().await; + assert_eq!(Arc::strong_count(&barrier), 2); + + // cancel + drop(f); + drop(stream); + + tokio::time::timeout(Duration::from_secs(5), async { + loop { + if Arc::strong_count(&barrier) == 1 { + return; + } + + tokio::time::sleep(Duration::from_millis(10)).await; + } + }) + .await + .unwrap(); + } + + #[tokio::test] + async fn 
test_inner_future_driven_to_completion_after_stream_ready() { + let barrier = Arc::new(tokio::sync::Barrier::new(2)); + let barrier_captured = Arc::clone(&barrier); + + let mut stream = CrossRtStream::::new_with_tx(|tx| async move { + tx.send(1).await.ok(); + drop(tx); + barrier_captured.wait().await; + }); + + let handle = tokio::spawn(async move { barrier.wait().await }); + + assert_eq!(stream.next().await, Some(1)); + handle.await.unwrap(); + } + + async fn ensure_pending(f: &mut F) + where + F: Future + Send + Unpin, + { + tokio::select! { + _ = tokio::time::sleep(Duration::from_millis(100)) => {} + _ = f => {panic!("not pending")}, + } + } +} diff --git a/iox_query/src/exec/field.rs b/iox_query/src/exec/field.rs new file mode 100644 index 0000000..5838890 --- /dev/null +++ b/iox_query/src/exec/field.rs @@ -0,0 +1,182 @@ +use std::sync::Arc; + +use arrow::{self, datatypes::SchemaRef}; +use schema::TIME_COLUMN_NAME; +use snafu::{ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Error finding field column: {:?} in schema '{}'", column_name, source))] + ColumnNotFoundForField { + column_name: String, + source: arrow::error::ArrowError, + }, +} + +pub type Result = std::result::Result; + +/// Names for a field: a value field and the associated timestamp columns +#[derive(Debug, PartialEq, Eq)] +pub enum FieldColumns { + /// All field columns share a timestamp column, named TIME_COLUMN_NAME + SharedTimestamp(Vec>), + + /// Each field has a potentially different timestamp column + // (value_name, timestamp_name) + DifferentTimestamp(Vec<(Arc, Arc)>), +} + +impl From>> for FieldColumns { + fn from(v: Vec>) -> Self { + Self::SharedTimestamp(v) + } +} + +impl From, Arc)>> for FieldColumns { + fn from(v: Vec<(Arc, Arc)>) -> Self { + Self::DifferentTimestamp(v) + } +} + +impl From> for FieldColumns { + fn from(v: Vec<&str>) -> Self { + let v = v.into_iter().map(Arc::from).collect(); + + Self::SharedTimestamp(v) + } +} + +impl From<&[&str]> 
for FieldColumns { + fn from(v: &[&str]) -> Self { + let v = v.iter().map(|v| Arc::from(*v)).collect(); + + Self::SharedTimestamp(v) + } +} + +/// Column indexes for a field: a value and corresponding timestamp +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct FieldIndex { + pub value_index: usize, + pub timestamp_index: usize, +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct FieldIndexes { + inner: Arc>, +} + +impl FieldIndexes { + /// Create FieldIndexes where each field has the same timestamp + /// and different value index + pub fn from_timestamp_and_value_indexes( + timestamp_index: usize, + value_indexes: &[usize], + ) -> Self { + value_indexes + .iter() + .map(|&value_index| FieldIndex { + value_index, + timestamp_index, + }) + .collect::>() + .into() + } + + /// Convert a slice of pairs (value_index, time_index) into + /// FieldIndexes + pub fn from_slice(v: &[(usize, usize)]) -> Self { + let inner = v + .iter() + .map(|&(value_index, timestamp_index)| FieldIndex { + value_index, + timestamp_index, + }) + .collect(); + + Self { + inner: Arc::new(inner), + } + } + + pub fn as_slice(&self) -> &[FieldIndex] { + self.inner.as_ref() + } + + pub fn iter(&self) -> impl Iterator { + self.as_slice().iter() + } +} + +impl From> for FieldIndexes { + fn from(list: Vec) -> Self { + Self { + inner: Arc::new(list), + } + } +} + +impl FieldIndexes { + // look up which column index correponds to each column name + pub fn names_to_indexes(schema: &SchemaRef, column_names: &[Arc]) -> Result> { + column_names + .iter() + .map(|column_name| { + schema + .index_of(column_name) + .context(ColumnNotFoundForFieldSnafu { + column_name: column_name.as_ref(), + }) + }) + .collect() + } + + /// Translate the field columns into pairs of (field_index, timestamp_index) + pub fn from_field_columns(schema: &SchemaRef, field_columns: &FieldColumns) -> Result { + let indexes = match field_columns { + FieldColumns::SharedTimestamp(field_names) => { + let timestamp_index = + 
schema + .index_of(TIME_COLUMN_NAME) + .context(ColumnNotFoundForFieldSnafu { + column_name: TIME_COLUMN_NAME, + })?; + + Self::names_to_indexes(schema, field_names)? + .into_iter() + .map(|field_index| FieldIndex { + value_index: field_index, + timestamp_index, + }) + .collect::>() + .into() + } + FieldColumns::DifferentTimestamp(fields_and_timestamp_names) => { + fields_and_timestamp_names + .iter() + .map(|(field_name, timestamp_name)| { + let field_index = + schema + .index_of(field_name) + .context(ColumnNotFoundForFieldSnafu { + column_name: field_name.as_ref(), + })?; + + let timestamp_index = schema.index_of(timestamp_name).context( + ColumnNotFoundForFieldSnafu { + column_name: TIME_COLUMN_NAME, + }, + )?; + + Ok(FieldIndex { + value_index: field_index, + timestamp_index, + }) + }) + .collect::>>()? + .into() + } + }; + Ok(indexes) + } +} diff --git a/iox_query/src/exec/fieldlist.rs b/iox_query/src/exec/fieldlist.rs new file mode 100644 index 0000000..e749543 --- /dev/null +++ b/iox_query/src/exec/fieldlist.rs @@ -0,0 +1,433 @@ +//! This module contains the definition of a "FieldList" a set of +//! records of (field_name, field_type, last_timestamp) and code to +//! pull them from RecordBatches +use std::{collections::BTreeMap, sync::Arc}; + +use arrow::{ + self, + array::TimestampNanosecondArray, + datatypes::{DataType, SchemaRef}, + record_batch::RecordBatch, +}; +use schema::TIME_COLUMN_NAME; + +use snafu::{ensure, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Internal error converting to FieldList. No time column in schema: {:?}. 
{}", + schema, + source + ))] + InternalNoTimeColumn { + schema: SchemaRef, + source: arrow::error::ArrowError, + }, + + #[snafu(display( + "Inconsistent data type for field '{}': found both '{:?}' and '{:?}'", + field_name, + data_type1, + data_type2 + ))] + InconsistentFieldType { + field_name: String, + data_type1: DataType, + data_type2: DataType, + }, +} + +pub type Result = std::result::Result; + +/// Represents a single Field (column)'s metadata: Name, data_type, +/// and most recent (last) timestamp. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Field { + pub name: String, + pub data_type: DataType, + pub last_timestamp: i64, +} + +/// A list of `Fields` +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct FieldList { + pub fields: Vec, +} + +/// Trait to convert RecordBatch'y things into `FieldLists`. Assumes +/// that the input RecordBatch es can each have a single string +/// column. +pub trait IntoFieldList { + /// Convert this thing into a fieldlist + fn into_fieldlist(self) -> Result; +} + +/// Converts record batches into FieldLists +impl IntoFieldList for Vec { + fn into_fieldlist(self) -> Result { + if self.is_empty() { + return Ok(FieldList::default()); + } + + // For each field in the schema (except time) for all rows + // that are non-null, update the current most-recent timestamp + // seen + let arrow_schema = self[0].schema(); + + let time_column_index = arrow_schema.index_of(TIME_COLUMN_NAME).with_context(|_| { + InternalNoTimeColumnSnafu { + schema: Arc::clone(&arrow_schema), + } + })?; + + // key: fieldname, value: highest value of time column we have seen + let mut field_times = BTreeMap::new(); + + for batch in self { + let time_column = batch + .column(time_column_index) + .as_any() + .downcast_ref::() + .expect("Downcasting time to TimestampNanosecondArray"); + + for (column_index, arrow_field) in arrow_schema.fields().iter().enumerate() { + if column_index == time_column_index { + continue; + } + let array = 
batch.column(column_index); + + // walk each value in array, looking for non-null values + let mut max_ts: Option = None; + for i in 0..batch.num_rows() { + if !array.is_null(i) { + let cur_ts = time_column.value(i); + max_ts = max_ts.map(|ts| std::cmp::max(ts, cur_ts)).or(Some(cur_ts)); + } + } + + if let Some(max_ts) = max_ts { + if let Some(ts) = field_times.get_mut(arrow_field.name()) { + *ts = std::cmp::max(max_ts, *ts); + } else { + field_times.insert(arrow_field.name().to_string(), max_ts); + } + } + } + } + + let fields = arrow_schema + .fields() + .iter() + .filter_map(|arrow_field| { + let field_name = arrow_field.name(); + if field_name == TIME_COLUMN_NAME { + None + } else { + field_times.get(field_name).map(|ts| Field { + name: field_name.to_string(), + data_type: arrow_field.data_type().clone(), + last_timestamp: *ts, + }) + } + }) + .collect(); + + Ok(FieldList { fields }) + } +} + +/// Merge several FieldLists into a single field list, merging the +/// entries appropriately +// Clippy gets confused and tells me that I should be using Self +// instead of Vec even though the type of Vec being created is different +#[allow(clippy::use_self)] +impl IntoFieldList for Vec { + fn into_fieldlist(self) -> Result { + if self.is_empty() { + return Ok(FieldList::default()); + } + + // otherwise merge the fields together + let mut field_map = BTreeMap::::new(); + + // iterate over all fields + let field_iter = self.into_iter().flat_map(|f| f.fields.into_iter()); + + for new_field in field_iter { + if let Some(existing_field) = field_map.get_mut(&new_field.name) { + ensure!( + existing_field.data_type == new_field.data_type, + InconsistentFieldTypeSnafu { + field_name: new_field.name, + data_type1: existing_field.data_type.clone(), + data_type2: new_field.data_type, + } + ); + existing_field.last_timestamp = + std::cmp::max(existing_field.last_timestamp, new_field.last_timestamp); + } + // no entry for field yet + else { + field_map.insert(new_field.name.clone(), 
new_field); + } + } + + let mut fields = field_map.into_values().collect::>(); + fields.sort_by(|a, b| a.name.cmp(&b.name)); + + Ok(FieldList { fields }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Arc; + + use arrow::array::ArrayRef; + use arrow::{ + array::{Int64Array, StringArray}, + datatypes::{DataType as ArrowDataType, Field as ArrowField, Schema}, + }; + use schema::{TIME_DATA_TIMEZONE, TIME_DATA_TYPE}; + + #[test] + fn test_convert_single_batch() { + let schema = Arc::new(Schema::new(vec![ + ArrowField::new("string_field", ArrowDataType::Utf8, true), + ArrowField::new("time", TIME_DATA_TYPE(), true), + ])); + + let string_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz", "foo"])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 2000, 3000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); + + let actual = do_conversion( + Arc::clone(&schema), + vec![vec![string_array, timestamp_array]], + ) + .expect("convert correctly"); + + let expected = FieldList { + fields: vec![Field { + name: "string_field".into(), + data_type: ArrowDataType::Utf8, + last_timestamp: 4000, + }], + }; + + assert_eq!( + expected, actual, + "Expected:\n{expected:#?}\nActual:\n{actual:#?}" + ); + + // expect same even if the timestamp order is different + + let string_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz", "foo"])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 4000, 2000, 3000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); + + let actual = do_conversion(schema, vec![vec![string_array, timestamp_array]]) + .expect("convert correctly"); + + assert_eq!( + expected, actual, + "Expected:\n{expected:#?}\nActual:\n{actual:#?}" + ); + } + + #[test] + fn test_convert_two_batches() { + let schema = Arc::new(Schema::new(vec![ + ArrowField::new("string_field", ArrowDataType::Utf8, true), + 
ArrowField::new("time", TIME_DATA_TYPE(), true), + ])); + + let string_array1: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"])); + let timestamp_array1: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 3000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); + + let string_array2: ArrayRef = Arc::new(StringArray::from(vec!["foo", "foo"])); + let timestamp_array2: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); + + let actual = do_conversion( + schema, + vec![ + vec![string_array1, timestamp_array1], + vec![string_array2, timestamp_array2], + ], + ) + .expect("convert correctly"); + + let expected = FieldList { + fields: vec![Field { + name: "string_field".into(), + data_type: ArrowDataType::Utf8, + last_timestamp: 4000, + }], + }; + + assert_eq!( + expected, actual, + "Expected:\n{expected:#?}\nActual:\n{actual:#?}" + ); + } + + #[test] + fn test_convert_all_nulls() { + let schema = Arc::new(Schema::new(vec![ + ArrowField::new("string_field", ArrowDataType::Utf8, true), + ArrowField::new("time", TIME_DATA_TYPE(), true), + ])); + + // string array has no actual values, so should not be returned as a field + let string_array: ArrayRef = + Arc::new(StringArray::from(vec![None::<&str>, None, None, None])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 2000, 3000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); + + let actual = do_conversion(schema, vec![vec![string_array, timestamp_array]]) + .expect("convert correctly"); + + let expected = FieldList { fields: vec![] }; + + assert_eq!( + expected, actual, + "Expected:\n{expected:#?}\nActual:\n{actual:#?}" + ); + } + + // test three columns, with different data types and null + #[test] + fn test_multi_column_multi_datatype() { + let schema = Arc::new(Schema::new(vec![ + ArrowField::new("string_field", ArrowDataType::Utf8, true), + 
ArrowField::new("int_field", ArrowDataType::Int64, true), + ArrowField::new("time", TIME_DATA_TYPE(), true), + ])); + + let string_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz", "foo"])); + let int_array: ArrayRef = + Arc::new(Int64Array::from(vec![Some(10), Some(20), Some(30), None])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 2000, 3000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); + + let expected = FieldList { + fields: vec![ + Field { + name: "string_field".into(), + data_type: ArrowDataType::Utf8, + last_timestamp: 4000, + }, + Field { + name: "int_field".into(), + data_type: ArrowDataType::Int64, + last_timestamp: 3000, + }, + ], + }; + + let actual = do_conversion(schema, vec![vec![string_array, int_array, timestamp_array]]) + .expect("conversion successful"); + + assert_eq!( + expected, actual, + "Expected:\n{expected:#?}\nActual:\n{actual:#?}" + ); + } + + fn do_conversion(schema: SchemaRef, value_arrays: Vec>) -> Result { + let batches = value_arrays + .into_iter() + .map(|arrays| { + RecordBatch::try_new(Arc::clone(&schema), arrays).expect("created new record batch") + }) + .collect::>(); + + batches.into_fieldlist() + } + + #[test] + fn test_merge_field_list() { + let field1 = Field { + name: "one".into(), + data_type: ArrowDataType::Utf8, + last_timestamp: 4000, + }; + let field2 = Field { + name: "two".into(), + data_type: ArrowDataType::Int64, + last_timestamp: 3000, + }; + + let l1 = FieldList { + fields: vec![field1, field2.clone()], + }; + let actual = vec![l1.clone()].into_fieldlist().unwrap(); + let expected = l1.clone(); + + assert_eq!( + expected, actual, + "Expected:\n{expected:#?}\nActual:\n{actual:#?}" + ); + + let field1_later = Field { + name: "one".into(), + data_type: ArrowDataType::Utf8, + last_timestamp: 5000, + }; + + // use something that has a later timestamp and expect the later one takes + // precedence + let l2 = FieldList { + fields: 
vec![field1_later.clone()], + }; + let actual = vec![l1.clone(), l2.clone()].into_fieldlist().unwrap(); + let expected = FieldList { + fields: vec![field1_later, field2], + }; + + assert_eq!( + expected, actual, + "Expected:\n{expected:#?}\nActual:\n{actual:#?}" + ); + + // Now, try to add a field that has a different type + + let field1_new_type = Field { + name: "one".into(), + data_type: ArrowDataType::Int64, + last_timestamp: 5000, + }; + + // use something that has a later timestamp and expect the later one takes + // precedence + let l3 = FieldList { + fields: vec![field1_new_type], + }; + let actual = vec![l1, l2, l3].into_fieldlist(); + let actual_error = actual.expect_err("should be an error").to_string(); + + let expected_error = + "Inconsistent data type for field 'one': found both 'Utf8' and 'Int64'"; + + assert!( + actual_error.contains(expected_error), + "Can not find expected '{expected_error}' in actual '{actual_error}'" + ); + } +} diff --git a/iox_query/src/exec/gapfill/algo.rs b/iox_query/src/exec/gapfill/algo.rs new file mode 100644 index 0000000..0733038 --- /dev/null +++ b/iox_query/src/exec/gapfill/algo.rs @@ -0,0 +1,1650 @@ +//! Contains the [GapFiller] type which does the +//! actual gap filling of record batches. + +mod interpolate; + +use std::{ops::Range, sync::Arc}; + +use arrow::{ + array::{Array, ArrayRef, TimestampNanosecondArray, UInt64Array}, + compute::{kernels::take, partition}, + datatypes::SchemaRef, + record_batch::RecordBatch, +}; +use datafusion::{ + error::{DataFusionError, Result}, + scalar::ScalarValue, +}; +use hashbrown::HashMap; + +use self::interpolate::Segment; + +use super::{params::GapFillParams, FillStrategy}; + +/// Provides methods to the [`GapFillStream`](super::stream::GapFillStream) +/// module that fill gaps in buffered input. 
+/// +/// [GapFiller] assumes that there will be at least `output_batch_size + 2` +/// input records buffered when [`build_gapfilled_output`](GapFiller::build_gapfilled_output) +/// is invoked, provided there is enough data. +/// +/// Once output is produced, clients should call `slice_input_batch` to unbuffer +/// data that is no longer needed. +/// +/// Below is a diagram of how buffered input is structured. +/// +/// ```text +/// +/// BUFFERED INPUT ROWS +/// +/// time group columns aggregate columns +/// ╓────╥───┬───┬─────────────╥───┬───┬─────────────╖ +/// context row 0 ║ ║ │ │ . . . ║ │ │ . . . ║ +/// ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢ +/// ┬──── cursor────► 1 ║ ║ │ │ ║ │ │ ║ +/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢ +/// │ 2 ║ ║ │ │ ║ │ │ ║ +/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢ +/// │ . . . +/// output_batch_size . . . +/// │ . . . +/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢ +/// │ n - 1 ║ ║ │ │ ║ │ │ ║ +/// │ ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢ +/// ┴──── n ║ ║ │ │ ║ │ │ ║ +/// ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢ +/// trailing row(s) n + 1 ║ ║ │ │ ║ │ │ ║ +/// ╟────╫───┼───┼─────────────╫───┼───┼─────────────╢ +/// . . . +/// . . . +/// . . . +/// ``` +/// +/// Just before generating output, the cursor will generally point at offset 1 +/// in the input, since offset 0 is a _context row_. The exception to this is +/// there is no context row when generating the first output batch. +/// +/// Buffering at least `output_batch_size + 2` rows ensures that: +/// - `GapFiller` can produce enough rows to produce a complete output batch, since +/// every input row will appear in the output. +/// - There is a _context row_ that represents the last input row that got output before +/// the current output batch. 
Group column values will be taken from this row +/// (using the [`take`](take::take) kernel) when we are generating trailing gaps, i.e., +/// when all of the input rows have been output for a series in the previous batch, +/// but there still remains missing rows to produce at the end. +/// - Having at least one additional _trailing row_ at the end ensures that `GapFiller` can +/// infer whether there is trailing gaps to produce at the beginning of the +/// next batch, since it can discover if the last row starts a new series. +/// - If there are columns that have a fill strategy of [`LinearInterpolate`], then more +/// trailing rows may be necessary to find the next non-null value for the column. +/// +/// [`LinearInterpolate`]: FillStrategy::LinearInterpolate +#[derive(Debug)] +pub(super) struct GapFiller { + /// The static parameters of gap-filling: time range start, end and the stride. + params: GapFillParams, + /// The number of rows to produce in each output batch. + batch_size: usize, + /// The current state of gap-filling, including the next timestamp, + /// the offset of the next input row, and remaining space in output batch. + cursor: Cursor, +} + +impl GapFiller { + /// Initialize a [GapFiller] at the beginning of an input record batch. + pub fn new(params: GapFillParams, batch_size: usize) -> Self { + let cursor = Cursor::new(¶ms); + Self { + params, + batch_size, + cursor, + } + } + + /// Given that the cursor points at the input row that will be + /// the first row in the next output batch, return the offset + /// of last input row that could possibly be in the output. + /// + /// This offset is used by ['BufferedInput`] to determine how many + /// rows need to be buffered. + /// + /// [`BufferedInput`]: super::BufferedInput + pub(super) fn last_output_row_offset(&self) -> usize { + self.cursor.next_input_offset + self.batch_size - 1 + } + + /// Returns true if there are no more output rows to produce given + /// the number of rows of buffered input. 
+ pub fn done(&self, buffered_input_row_count: usize) -> bool { + self.cursor.done(buffered_input_row_count) + } + + /// Produces a gap-filled output [RecordBatch]. + /// + /// Input arrays are represented as pairs that include their offset in the + /// schema at member `0`. + pub fn build_gapfilled_output( + &mut self, + schema: SchemaRef, + input_time_array: (usize, &TimestampNanosecondArray), + group_arrays: &[(usize, ArrayRef)], + aggr_arrays: &[(usize, ArrayRef)], + ) -> Result { + let series_ends = self.plan_output_batch(input_time_array.1, group_arrays)?; + self.cursor.remaining_output_batch_size = self.batch_size; + self.build_output( + schema, + input_time_array, + group_arrays, + aggr_arrays, + &series_ends, + ) + } + + /// Slice the input batch so that it has one context row before the next input offset. + pub fn slice_input_batch(&mut self, batch: RecordBatch) -> Result { + if self.cursor.next_input_offset < 2 { + // nothing to do + return Ok(batch); + } + + let offset = self.cursor.next_input_offset - 1; + self.cursor.slice(offset, &batch)?; + + let len = batch.num_rows() - offset; + Ok(batch.slice(offset, len)) + } + + /// Produces a vector of offsets that are the exclusive ends of each series + /// in the buffered input. It will return the ends of only those series + /// that can at least be started in the output batch. + /// + /// Uses [`lexicographical_partition_ranges`](arrow::compute::lexicographical_partition_ranges) + /// to partition input rows into series. + fn plan_output_batch( + &mut self, + input_time_array: &TimestampNanosecondArray, + group_arr: &[(usize, ArrayRef)], + ) -> Result> { + if group_arr.is_empty() { + // there are no group columns, so the output + // will be just one big series. 
+ return Ok(vec![input_time_array.len()]); + } + + let sort_columns = group_arr + .iter() + .map(|(_, arr)| Arc::clone(arr)) + .collect::>(); + + let mut ranges = partition(&sort_columns)?.ranges().into_iter(); + + let mut series_ends = vec![]; + let mut cursor = self.cursor.clone_for_aggr_col(None)?; + let mut output_row_count = 0; + + let start_offset = cursor.next_input_offset; + assert!(start_offset <= 1, "input is sliced after it is consumed"); + while output_row_count < self.batch_size { + match ranges.next() { + Some(Range { end, .. }) => { + assert!( + end > 0, + "each lexicographical partition will have at least one row" + ); + + if let Some(nrows) = + cursor.count_series_rows(&self.params, input_time_array, end) + { + output_row_count += nrows; + series_ends.push(end); + } + } + None => break, + } + } + + Ok(series_ends) + } + + /// Helper method that produces gap-filled record batches. + /// + /// This method works by producing each array in the output completely, + /// for all series that have end offsets in `series_ends`, before producing + /// subsequent arrays. 
+ fn build_output( + &mut self, + schema: SchemaRef, + input_time_array: (usize, &TimestampNanosecondArray), + group_arr: &[(usize, ArrayRef)], + aggr_arr: &[(usize, ArrayRef)], + series_ends: &[usize], + ) -> Result { + let mut output_arrays: Vec<(usize, ArrayRef)> = + Vec::with_capacity(group_arr.len() + aggr_arr.len() + 1); // plus one for time column + + // build the time column + let mut cursor = self.cursor.clone_for_aggr_col(None)?; + let (time_idx, input_time_array) = input_time_array; + let time_vec = cursor.build_time_vec(&self.params, series_ends, input_time_array)?; + let output_time_len = time_vec.len(); + output_arrays.push(( + time_idx, + Arc::new( + TimestampNanosecondArray::from(time_vec) + .with_timezone_opt(input_time_array.timezone()), + ), + )); + // There may not be any aggregate or group columns, so use this cursor state as the new + // GapFiller cursor once this output batch is complete. + let mut final_cursor = cursor; + + // build the other group columns + for (idx, ga) in group_arr { + let mut cursor = self.cursor.clone_for_aggr_col(None)?; + let take_vec = + cursor.build_group_take_vec(&self.params, series_ends, input_time_array)?; + if take_vec.len() != output_time_len { + return Err(DataFusionError::Internal(format!( + "gapfill group column has {} rows, expected {}", + take_vec.len(), + output_time_len + ))); + } + let take_arr = UInt64Array::from(take_vec); + output_arrays.push((*idx, take::take(ga, &take_arr, None)?)) + } + + // Build the aggregate columns + for (idx, aa) in aggr_arr { + let mut cursor = self.cursor.clone_for_aggr_col(Some(*idx))?; + let output_array = + cursor.build_aggr_col(&self.params, series_ends, input_time_array, aa)?; + if output_array.len() != output_time_len { + return Err(DataFusionError::Internal(format!( + "gapfill aggr column has {} rows, expected {}", + output_array.len(), + output_time_len + ))); + } + output_arrays.push((*idx, output_array)); + final_cursor.merge_aggr_col_cursor(cursor); + } + + 
output_arrays.sort_by(|(a, _), (b, _)| a.cmp(b)); + let output_arrays: Vec<_> = output_arrays.into_iter().map(|(_, arr)| arr).collect(); + let batch = RecordBatch::try_new(Arc::clone(&schema), output_arrays) + .map_err(|err| DataFusionError::ArrowError(err, None))?; + + self.cursor = final_cursor; + Ok(batch) + } +} + +/// Maintains the state needed to fill gaps in output columns. Also provides methods +/// for building vectors that build time, group, and aggregate output arrays. +#[derive(Debug)] +pub(crate) struct Cursor { + /// Where to read the next row from the input. + next_input_offset: usize, + /// The next timestamp to be produced for the current series. + /// Since the lower bound for gap filling could just be "whatever + /// the first timestamp in the series is," this may be `None` before + /// any rows with non-null timestamps are produced for a series. + next_ts: Option, + /// How many rows may be output before we need to start a new record batch. + remaining_output_batch_size: usize, + /// True if there are trailing gaps from after the last input row for a series + /// to be produced at the beginning of the next output batch. + trailing_gaps: bool, + /// State for each aggregate column, keyed on the columns offset in the schema. + aggr_col_states: HashMap, +} + +impl Cursor { + /// Creates a new cursor. + fn new(params: &GapFillParams) -> Self { + let aggr_col_states = params + .fill_strategy + .iter() + .map(|(idx, fs)| (*idx, AggrColState::new(fs))) + .collect(); + Self { + next_input_offset: 0, + next_ts: params.first_ts, + remaining_output_batch_size: 0, + trailing_gaps: false, + aggr_col_states, + } + } + + /// Returns true of we point past all rows of buffered input and there + /// are no trailing gaps left to produce. 
+ fn done(&self, buffered_input_row_count: usize) -> bool { + self.next_input_offset == buffered_input_row_count && !self.trailing_gaps + } + + /// Make a clone of this cursor to be used for creating an aggregate column, + /// if `idx` is `Some`. The resulting `Cursor` will only contain [AggrColState] + /// for the indicated column. + /// + /// When `idx` is `None`, return a `Cursor` with an empty [Cursor::aggr_col_states]. + fn clone_for_aggr_col(&self, idx: Option) -> Result { + let mut cur = Self { + next_input_offset: self.next_input_offset, + next_ts: self.next_ts, + remaining_output_batch_size: self.remaining_output_batch_size, + trailing_gaps: self.trailing_gaps, + aggr_col_states: HashMap::default(), + }; + if let Some(idx) = idx { + let state = self + .aggr_col_states + .get(&idx) + .ok_or(DataFusionError::Internal(format!( + "could not find aggr col with offset {idx}" + )))?; + cur.aggr_col_states.insert(idx, state.clone()); + } + Ok(cur) + } + + /// Update [Cursor::aggr_col_states] with updated state for an + /// aggregate column. `cursor` will have been created via `Cursor::clone_for_aggr_col`, + /// so [Cursor::aggr_col_states] will contain exactly one item. + /// + /// # Panics + /// + /// Will panic if input cursor's [Cursor::aggr_col_states] does not contain exactly one item. + fn merge_aggr_col_cursor(&mut self, cursor: Self) { + assert_eq!(1, cursor.aggr_col_states.len()); + for (idx, state) in cursor.aggr_col_states.into_iter() { + self.aggr_col_states.insert(idx, state); + } + } + + /// Get the [AggrColState] for this cursor. `self` will have been created via + /// `Cursor::clone_for_aggr_col`, so [Cursor::aggr_col_states] will contain exactly one item. + /// + /// # Panics + /// + /// Will panic if [Cursor::aggr_col_states] does not contain exactly one item. 
+ fn get_aggr_col_state(&self) -> &AggrColState { + assert_eq!(1, self.aggr_col_states.len()); + self.aggr_col_states.iter().next().unwrap().1 + } + + /// Set the [AggrColState] for this cursor. `self` will have been created via + /// `Cursor::clone_for_aggr_col`, so [Cursor::aggr_col_states] will contain exactly one item. + /// + /// # Panics + /// + /// Will panic if [Cursor::aggr_col_states] does not contain exactly one item. + fn set_aggr_col_state(&mut self, new_state: AggrColState) { + assert_eq!(1, self.aggr_col_states.len()); + let (_idx, state) = self.aggr_col_states.iter_mut().next().unwrap(); + *state = new_state; + } + + /// Counts the number of rows that will be produced for a series that ends (exclusively) + /// at `series_end`, including rows that have a null timestamp, if any. + /// + /// Produces `None` for the case where `next_input_offset` is equal to `series_end`, + /// and there are no trailing gaps to produce. + fn count_series_rows( + &mut self, + params: &GapFillParams, + input_time_array: &TimestampNanosecondArray, + series_end: usize, + ) -> Option { + if !self.trailing_gaps && self.next_input_offset == series_end { + return None; + } + + let mut count = if input_time_array.null_count() > 0 { + let len = series_end - self.next_input_offset; + let slice = input_time_array.slice(self.next_input_offset, len); + slice.null_count() + } else { + 0 + }; + + self.next_input_offset += count; + if self.maybe_init_next_ts(input_time_array, series_end) { + count += params.valid_row_count(self.next_ts.unwrap()); + } + + self.next_input_offset = series_end; + self.next_ts = params.first_ts; + + Some(count) + } + + /// Update this cursor to reflect that `offset` older rows are being sliced off from the + /// buffered input. 
+ fn slice(&mut self, offset: usize, batch: &RecordBatch) -> Result<()> { + for (idx, aggr_col_state) in &mut self.aggr_col_states { + aggr_col_state.slice(offset, batch.column(*idx))?; + } + self.next_input_offset -= offset; + Ok(()) + } + + /// Attempts to assign a value to `self.next_ts` if it does not have one. + /// + /// This bit of abstraction is needed because the lower bound for gap filling may be + /// determined in one of two ways: + /// * If the [`GapFillParams`] provided by client code has `first_ts` set to `Some`, this + /// will be the first timestamp for each series. In this case `self.next_ts` + /// will never `None`, and this function does nothing. + /// * Otherwise it is determined to be whatever the first timestamp in the input series is. + /// In this case `params.first_ts == None`, and we need to extract the timestamp from + /// the input time array. + /// + /// Returns true if `self.next_ts` ends up containing a value. + fn maybe_init_next_ts( + &mut self, + input_time_array: &TimestampNanosecondArray, + series_end: usize, + ) -> bool { + self.next_ts = match self.next_ts { + Some(_) => self.next_ts, + None if self.next_input_offset < series_end + && input_time_array.is_valid(self.next_input_offset) => + { + Some(input_time_array.value(self.next_input_offset)) + } + // This may happen if current input offset points at a row + // with a null timestamp, or is past the end of the current series. + _ => None, + }; + self.next_ts.is_some() + } + + /// Builds a vector that can be used to produce a timestamp array. + fn build_time_vec( + &mut self, + params: &GapFillParams, + series_ends: &[usize], + input_time_array: &TimestampNanosecondArray, + ) -> Result>> { + struct TimeBuilder { + times: Vec>, + } + + impl VecBuilder for TimeBuilder { + fn push(&mut self, row_status: RowStatus) -> Result<()> { + match row_status { + RowStatus::NullTimestamp { .. } => self.times.push(None), + RowStatus::Present { ts, .. } | RowStatus::Missing { ts, .. 
} => { + self.times.push(Some(ts)) + } + } + Ok(()) + } + } + + let mut time_builder = TimeBuilder { + times: Vec::with_capacity(self.remaining_output_batch_size), + }; + self.build_vec(params, input_time_array, series_ends, &mut time_builder)?; + + Ok(time_builder.times) + } + + /// Builds a vector that can use the [`take`](take::take) kernel + /// to produce a group column. + fn build_group_take_vec( + &mut self, + params: &GapFillParams, + series_ends: &[usize], + input_time_array: &TimestampNanosecondArray, + ) -> Result> { + struct GroupBuilder { + take_idxs: Vec, + } + + impl VecBuilder for GroupBuilder { + fn push(&mut self, row_status: RowStatus) -> Result<()> { + match row_status { + RowStatus::NullTimestamp { + series_end_offset, .. + } + | RowStatus::Present { + series_end_offset, .. + } + | RowStatus::Missing { + series_end_offset, .. + } => self.take_idxs.push(series_end_offset as u64 - 1), + } + Ok(()) + } + } + + let mut group_builder = GroupBuilder { + take_idxs: Vec::with_capacity(self.remaining_output_batch_size), + }; + self.build_vec(params, input_time_array, series_ends, &mut group_builder)?; + + Ok(group_builder.take_idxs) + } + + /// Produce a gap-filled array for the aggregate column + /// in [`Self::aggr_col_states`]. + /// + /// # Panics + /// + /// Will panic if [Cursor::aggr_col_states] does not contain exactly one item. + fn build_aggr_col( + &mut self, + params: &GapFillParams, + series_ends: &[usize], + input_time_array: &TimestampNanosecondArray, + input_aggr_array: &ArrayRef, + ) -> Result { + match self.get_aggr_col_state() { + AggrColState::Null => { + self.build_aggr_fill_null(params, series_ends, input_time_array, input_aggr_array) + } + AggrColState::PrevNullAsIntentional { .. } | AggrColState::PrevNullAsMissing { .. } => { + self.build_aggr_fill_prev(params, series_ends, input_time_array, input_aggr_array) + } + AggrColState::PrevNullAsMissingStashed { .. 
} => self.build_aggr_fill_prev_stashed( + params, + series_ends, + input_time_array, + input_aggr_array, + ), + AggrColState::LinearInterpolate(_) => self.build_aggr_fill_interpolate( + params, + series_ends, + input_time_array, + input_aggr_array, + ), + } + } + + /// Builds an array using the [`take`](take::take) kernel + /// to produce an aggregate output column, filling gaps with + /// null values. + fn build_aggr_fill_null( + &mut self, + params: &GapFillParams, + series_ends: &[usize], + input_time_array: &TimestampNanosecondArray, + input_aggr_array: &ArrayRef, + ) -> Result { + struct AggrBuilder { + take_idxs: Vec>, + } + + impl VecBuilder for AggrBuilder { + fn push(&mut self, row_status: RowStatus) -> Result<()> { + match row_status { + RowStatus::NullTimestamp { offset, .. } | RowStatus::Present { offset, .. } => { + self.take_idxs.push(Some(offset as u64)) + } + RowStatus::Missing { .. } => self.take_idxs.push(None), + } + Ok(()) + } + } + + let mut aggr_builder = AggrBuilder { + take_idxs: Vec::with_capacity(self.remaining_output_batch_size), + }; + self.build_vec(params, input_time_array, series_ends, &mut aggr_builder)?; + + let take_arr = UInt64Array::from(aggr_builder.take_idxs); + take::take(input_aggr_array, &take_arr, None) + .map_err(|err| DataFusionError::ArrowError(err, None)) + } + + /// Builds an array using the [`take`](take::take) kernel + /// to produce an aggregate output column, filling gaps with the + /// previous values in the column. + fn build_aggr_fill_prev( + &mut self, + params: &GapFillParams, + series_ends: &[usize], + input_time_array: &TimestampNanosecondArray, + input_aggr_array: &ArrayRef, + ) -> Result { + struct AggrBuilder<'a> { + take_idxs: Vec>, + prev_offset: Option, + input_aggr_array: &'a ArrayRef, + null_as_missing: bool, + } + + impl<'a> VecBuilder for AggrBuilder<'a> { + fn push(&mut self, row_status: RowStatus) -> Result<()> { + match row_status { + RowStatus::NullTimestamp { offset, .. 
} => { + self.take_idxs.push(Some(offset as u64)) + } + RowStatus::Present { offset, .. } => { + if !self.null_as_missing || self.input_aggr_array.is_valid(offset) { + self.take_idxs.push(Some(offset as u64)); + self.prev_offset = Some(offset as u64); + } else { + self.take_idxs.push(self.prev_offset); + } + } + RowStatus::Missing { .. } => self.take_idxs.push(self.prev_offset), + } + Ok(()) + } + fn start_new_series(&mut self) -> Result<()> { + self.prev_offset = None; + Ok(()) + } + } + + let null_as_missing = matches!( + self.get_aggr_col_state(), + AggrColState::PrevNullAsMissing { .. } + ); + + let mut aggr_builder = AggrBuilder { + take_idxs: Vec::with_capacity(self.remaining_output_batch_size), + prev_offset: self.get_aggr_col_state().prev_offset(), + input_aggr_array, + null_as_missing, + }; + self.build_vec(params, input_time_array, series_ends, &mut aggr_builder)?; + + let AggrBuilder { + take_idxs, + prev_offset, + .. + } = aggr_builder; + self.set_aggr_col_state(match null_as_missing { + false => AggrColState::PrevNullAsIntentional { + offset: prev_offset, + }, + true => AggrColState::PrevNullAsMissing { + offset: prev_offset, + }, + }); + + let take_arr = UInt64Array::from(take_idxs); + take::take(input_aggr_array, &take_arr, None) + .map_err(|err| DataFusionError::ArrowError(err, None)) + } + + /// Builds an array using the [`interleave`](arrow::compute::interleave) kernel + /// to produce an aggregate output column, filling gaps with the + /// previous values in the column. 
+ fn build_aggr_fill_prev_stashed( + &mut self, + params: &GapFillParams, + series_ends: &[usize], + input_time_array: &TimestampNanosecondArray, + input_aggr_array: &ArrayRef, + ) -> Result { + let stash = self.get_aggr_col_state().stash(); + let mut aggr_builder = StashedAggrBuilder { + interleave_idxs: Vec::with_capacity(self.remaining_output_batch_size), + state: StashedAggrState::Stashed, + stash, + input_aggr_array, + }; + self.build_vec(params, input_time_array, series_ends, &mut aggr_builder)?; + let output_array = aggr_builder.build()?; + + // Update the aggregate column state for this cursor to prime it for the + // next batch. + let StashedAggrBuilder { state, .. } = aggr_builder; + match state { + StashedAggrState::Stashed => (), // nothing changes + StashedAggrState::PrevNone => { + self.set_aggr_col_state(AggrColState::PrevNullAsMissing { offset: None }) + } + StashedAggrState::PrevSome { offset } => { + self.set_aggr_col_state(AggrColState::PrevNullAsMissing { + offset: Some(offset as u64), + }) + } + }; + + Ok(output_array) + } + + /// Helper method that iterates over each series + /// that ends with offsets in `series_ends` and produces + /// the appropriate output values. 
+ fn build_vec( + &mut self, + params: &GapFillParams, + input_time_array: &TimestampNanosecondArray, + series_ends: &[usize], + vec_builder: &mut impl VecBuilder, + ) -> Result<()> { + for series in series_ends { + if self + .next_ts + .map_or(false, |next_ts| next_ts > params.last_ts) + { + vec_builder.start_new_series()?; + self.next_ts = params.first_ts; + } + + self.append_series_items(params, input_time_array, *series, vec_builder)?; + } + + let last_series_end = series_ends.last().ok_or(DataFusionError::Internal( + "expected at least one item in series batch".to_string(), + ))?; + + self.trailing_gaps = self.next_input_offset == *last_series_end + && self + .next_ts + .map_or(true, |next_ts| next_ts <= params.last_ts); + Ok(()) + } + + /// Helper method that generates output for one series by invoking + /// [VecBuilder::push] for each output value in the column to be generated. + fn append_series_items( + &mut self, + params: &GapFillParams, + input_times: &TimestampNanosecondArray, + series_end: usize, + vec_builder: &mut impl VecBuilder, + ) -> Result<()> { + // If there are any null timestamps for this group, they will be first. + // These rows can just be copied into the output. + // Append the corresponding values. 
+ while self.remaining_output_batch_size > 0 + && self.next_input_offset < series_end + && input_times.is_null(self.next_input_offset) + { + vec_builder.push(RowStatus::NullTimestamp { + series_end_offset: series_end, + offset: self.next_input_offset, + })?; + self.remaining_output_batch_size -= 1; + self.next_input_offset += 1; + } + + if !self.maybe_init_next_ts(input_times, series_end) { + return Ok(()); + } + let mut next_ts = self.next_ts.unwrap(); + + let output_row_count = std::cmp::min( + params.valid_row_count(next_ts), + self.remaining_output_batch_size, + ); + if output_row_count == 0 { + return Ok(()); + } + + // last_ts is the last timestamp that will fit in the output batch + let last_ts = next_ts + (output_row_count - 1) as i64 * params.stride; + + loop { + if self.next_input_offset >= series_end { + break; + } + let in_ts = input_times.value(self.next_input_offset); + if in_ts > last_ts { + break; + } + while next_ts < in_ts { + vec_builder.push(RowStatus::Missing { + series_end_offset: series_end, + ts: next_ts, + })?; + next_ts += params.stride; + } + vec_builder.push(RowStatus::Present { + series_end_offset: series_end, + offset: self.next_input_offset, + ts: next_ts, + })?; + next_ts += params.stride; + self.next_input_offset += 1; + } + + // Add any additional missing values after the last of the input. + while next_ts <= last_ts { + vec_builder.push(RowStatus::Missing { + series_end_offset: series_end, + ts: next_ts, + })?; + next_ts += params.stride; + } + + self.next_ts = Some(last_ts + params.stride); + self.remaining_output_batch_size -= output_row_count; + Ok(()) + } +} + +/// Maintains the state needed to fill gaps in an aggregate column, +/// depending on the fill strategy. +#[derive(Clone, Debug)] +enum AggrColState { + /// For [FillStrategy::Null] there is no state to maintain. + Null, + /// For [FillStrategy::PrevNullAsIntentional]. + PrevNullAsIntentional { offset: Option }, + /// For [FillStrategy::PrevNullAsMissing]. 
+ PrevNullAsMissing { offset: Option }, + /// For [FillStrategy::PrevNullAsMissing], when + /// the fill value must be stashed in a separate array so it + /// can persist across output batches. + /// + /// This state happens when the previous value in the buffered input + /// rows has gone away during a call to [`GapFiller::slice_input_batch`]. + PrevNullAsMissingStashed { stash: ArrayRef }, + /// For [FillStrategy::LinearInterpolate], this tracks if we are in the middle + /// of a "segment" (two non-null points in the input separated by more + /// than the stride) between output batches. + LinearInterpolate(Option>), +} + +impl AggrColState { + /// Create a new [AggrColState] based on the [FillStrategy] for the column. + fn new(fill_strategy: &FillStrategy) -> Self { + match fill_strategy { + FillStrategy::Null => Self::Null, + FillStrategy::PrevNullAsIntentional => Self::PrevNullAsIntentional { offset: None }, + FillStrategy::PrevNullAsMissing => Self::PrevNullAsMissing { offset: None }, + FillStrategy::LinearInterpolate => Self::LinearInterpolate(None), + } + } + + /// Return the offset in the input from which to fill gaps. + /// + /// # Panics + /// + /// This method will panic if `self` is not [AggrColState::PrevNullAsIntentional] + /// or [AggrColState::PrevNullAsMissing]. + fn prev_offset(&self) -> Option { + match self { + Self::PrevNullAsIntentional { offset } | Self::PrevNullAsMissing { offset } => *offset, + _ => unreachable!(), + } + } + + /// Update state to reflect that older rows in the buffered input + /// are being sliced away. + fn slice(&mut self, offset: usize, array: &ArrayRef) -> Result<()> { + let offset = offset as u64; + match self { + Self::PrevNullAsMissing { offset: Some(v) } if offset > *v => { + // The element in the buffered input that may be in the output + // will be sliced away, so store it on the side. 
+ let stash = StashedAggrBuilder::create_stash(array, *v)?; + *self = Self::PrevNullAsMissingStashed { stash }; + } + Self::PrevNullAsIntentional { offset: Some(v) } + | Self::PrevNullAsMissing { offset: Some(v) } => *v -= offset, + _ => (), + }; + Ok(()) + } + + /// Return the stashed previous value used to fill gaps. + /// + /// # Panics + /// + /// This method will panic if `self` is not [AggrColState::PrevNullAsMissingStashed]. + fn stash(&self) -> ArrayRef { + match self { + Self::PrevNullAsMissingStashed { stash } => Arc::clone(stash), + _ => unreachable!(), + } + } + + /// Return the segment being interpolated, if any. + /// + /// # Panics + /// + /// This method will panic if `self` is not [AggrColState::LinearInterpolate]. + fn segment(&self) -> &Option> { + match self { + Self::LinearInterpolate(segment) => segment, + _ => unreachable!(), + } + } +} + +/// A trait that lets implementors describe how to build the +/// vectors used to create Arrow arrays in the output. +trait VecBuilder { + /// Pushes a new value based on the output row's + /// relation to the input row. + fn push(&mut self, _: RowStatus) -> Result<()>; + + /// Called just before a new series starts. + fn start_new_series(&mut self) -> Result<()> { + Ok(()) + } +} + +/// The state of an input row relative to gap-filled output. +#[derive(Debug)] +enum RowStatus { + /// This row had a null timestamp in the input. + NullTimestamp { + /// The exclusive offset of the series end in the input. + series_end_offset: usize, + /// The offset of the null timestamp in the input time array. + offset: usize, + }, + /// A row with this timestamp is present in the input. + Present { + /// The exclusive offset of the series end in the input. + series_end_offset: usize, + /// The offset of the value in the input time array. + offset: usize, + /// The timestamp corresponding to this row. + ts: i64, + }, + /// A row with this timestamp is missing from the input. 
+ Missing { + /// The exclusive offset of the series end in the input. + series_end_offset: usize, + /// The timestamp corresponding to this row. + ts: i64, + }, +} + +/// Implements [`VecBuilder`] for [`FillStrategy::PrevNullAsMissing`], +/// specifically for the case where a previous value that needs to be +/// propagated into a new output batch has been sliced off from +/// buffered input rows. +struct StashedAggrBuilder<'a> { + interleave_idxs: Vec<(usize, usize)>, + state: StashedAggrState, + stash: ArrayRef, + input_aggr_array: &'a ArrayRef, +} + +impl StashedAggrBuilder<'_> { + /// Create a 2-element array containing a null value and the value from + /// `input_aggr_array` at `offset` for use with the [`interleave`](arrow::compute::interleave) + /// kernel. + fn create_stash(input_aggr_array: &ArrayRef, offset: u64) -> Result { + let take_arr: UInt64Array = vec![None, Some(offset)].into(); + let stash = take::take(input_aggr_array, &take_arr, None) + .map_err(|err| DataFusionError::ArrowError(err, None))?; + Ok(stash) + } + + /// Build the output column. + fn build(&self) -> Result { + arrow::compute::interleave(&[&self.stash, self.input_aggr_array], &self.interleave_idxs) + .map_err(|err| DataFusionError::ArrowError(err, None)) + } + + fn buffered_input(offset: usize) -> (usize, usize) { + (Self::BUFFERED_INPUT_ARRAY, offset) + } + + const STASHED_NULL: (usize, usize) = (0, 0); + const STASHED_VALUE: (usize, usize) = (0, 1); + const BUFFERED_INPUT_ARRAY: usize = 1; +} + +/// Stores state about how to fill the output aggregate column +/// for [`StashedAggrBuilder`]. +enum StashedAggrState { + /// Fill the next missing or null element with the + /// stashed value. + Stashed, + /// Fill the next missing or null element with a null value. + PrevNone, + /// Fill the next missing or null element with the element in the + /// input at `offset`. 
+ PrevSome { offset: usize }, +} + +impl<'a> VecBuilder for StashedAggrBuilder<'a> { + fn push(&mut self, row_status: RowStatus) -> Result<()> { + match row_status { + RowStatus::NullTimestamp { offset, .. } => { + self.interleave_idxs.push(Self::buffered_input(offset)); + self.state = StashedAggrState::PrevNone; + } + RowStatus::Present { offset, .. } if self.input_aggr_array.is_valid(offset) => { + self.interleave_idxs.push(Self::buffered_input(offset)); + self.state = StashedAggrState::PrevSome { offset }; + } + RowStatus::Present { .. } | RowStatus::Missing { .. } => match self.state { + StashedAggrState::Stashed => self.interleave_idxs.push(Self::STASHED_VALUE), + StashedAggrState::PrevNone => self.interleave_idxs.push(Self::STASHED_NULL), + StashedAggrState::PrevSome { offset } => { + self.interleave_idxs.push(Self::buffered_input(offset)) + } + }, + } + + Ok(()) + } + + fn start_new_series(&mut self) -> Result<()> { + self.state = StashedAggrState::PrevNone; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::{ + array::{ArrayRef, Float64Array, TimestampNanosecondArray}, + datatypes::{Field, Schema}, + record_batch::RecordBatch, + }; + use arrow_util::test_util::batches_to_lines; + use datafusion::error::Result; + use hashbrown::HashMap; + use schema::{InfluxColumnType, TIME_DATA_TIMEZONE}; + + use crate::exec::gapfill::{ + algo::{AggrColState, Cursor}, + params::GapFillParams, + FillStrategy, + }; + + #[test] + fn test_cursor_append_time_values() -> Result<()> { + test_helpers::maybe_start_logging(); + let input_times = TimestampNanosecondArray::from(vec![1000, 1100, 1200]); + let series = input_times.len(); + + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1250, + fill_strategy: simple_fill_strategy(), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let out_times = cursor.build_time_vec(¶ms, &[series], &input_times)?; + 
assert_eq!( + vec![ + Some(950), + Some(1000), + Some(1050), + Some(1100), + Some(1150), + Some(1200), + Some(1250) + ], + out_times + ); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + + Ok(()) + } + + #[test] + fn test_cursor_append_time_values_no_first_ts() { + test_helpers::maybe_start_logging(); + let input_times = TimestampNanosecondArray::from(vec![1100, 1200]); + let series = input_times.len(); + + let params = GapFillParams { + stride: 50, + first_ts: None, + last_ts: 1250, + fill_strategy: simple_fill_strategy(), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let out_times = cursor + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(); + assert_eq!( + vec![Some(1100), Some(1150), Some(1200), Some(1250)], + out_times + ); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + } + + #[test] + fn test_cursor_append_time_value_nulls() -> Result<()> { + test_helpers::maybe_start_logging(); + let input_times = + TimestampNanosecondArray::from(vec![None, None, Some(1000), Some(1100), Some(1200)]); + let series = input_times.len(); + + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1250, + fill_strategy: simple_fill_strategy(), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + let out_times = cursor.build_time_vec(¶ms, &[series], &input_times)?; + assert_eq!( + vec![ + None, + None, + Some(950), + Some(1000), + Some(1050), + Some(1100), + Some(1150), + Some(1200), + Some(1250) + ], + out_times + ); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + + Ok(()) + } + + #[test] + fn test_cursor_append_group_take() -> Result<()> { + let input_times = TimestampNanosecondArray::from(vec![1000, 1100, 1200]); + let series = input_times.len(); + + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1250, + fill_strategy: simple_fill_strategy(), + }; + + let 
output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + let take_idxs = cursor.build_group_take_vec(¶ms, &[series], &input_times)?; + assert_eq!(vec![2; 7], take_idxs); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + + Ok(()) + } + + #[test] + fn test_cursor_append_aggr_take() { + let input_times = TimestampNanosecondArray::from(vec![1000, 1100, 1200]); + let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![10.0, 11.0, 12.0])); + let series = input_times.len(); + + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1250, + fill_strategy: simple_fill_strategy(), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = cursor + .build_aggr_fill_null(¶ms, &[series], &input_times, &input_aggr_array) + .unwrap(); + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - +--------------------------------+------+ + - "| time | a0 |" + - +--------------------------------+------+ + - "| 1970-01-01T00:00:00.000000950Z | |" + - "| 1970-01-01T00:00:00.000001Z | 10.0 |" + - "| 1970-01-01T00:00:00.000001050Z | |" + - "| 1970-01-01T00:00:00.000001100Z | 11.0 |" + - "| 1970-01-01T00:00:00.000001150Z | |" + - "| 1970-01-01T00:00:00.000001200Z | 12.0 |" + - "| 1970-01-01T00:00:00.000001250Z | |" + - +--------------------------------+------+ + "###); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + } + + #[test] + fn test_cursor_append_aggr_take_nulls() -> Result<()> { + test_helpers::maybe_start_logging(); + let input_times = + TimestampNanosecondArray::from(vec![None, None, Some(1000), Some(1100), Some(1200)]); + let input_aggr_array: ArrayRef = + Arc::new(Float64Array::from(vec![0.1, 
0.2, 10.0, 11.0, 12.0])); + let series = input_times.len(); + + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1250, + fill_strategy: simple_fill_strategy(), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = + cursor.build_aggr_fill_null(¶ms, &[series], &input_times, &input_aggr_array)?; + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - +--------------------------------+------+ + - "| time | a0 |" + - +--------------------------------+------+ + - "| | 0.1 |" + - "| | 0.2 |" + - "| 1970-01-01T00:00:00.000000950Z | |" + - "| 1970-01-01T00:00:00.000001Z | 10.0 |" + - "| 1970-01-01T00:00:00.000001050Z | |" + - "| 1970-01-01T00:00:00.000001100Z | 11.0 |" + - "| 1970-01-01T00:00:00.000001150Z | |" + - "| 1970-01-01T00:00:00.000001200Z | 12.0 |" + - "| 1970-01-01T00:00:00.000001250Z | |" + - +--------------------------------+------+ + "###); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + + Ok(()) + } + + #[test] + fn test_cursor_append_aggr_take_prev() { + let input_times = TimestampNanosecondArray::from(vec![ + // 950 + 1000, // 1050 + 1100, // 1150 + 1200, + // 1250 + ]); + let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![10.0, 11.0, 12.0])); + let series = input_times.len(); + + let idx = 0; + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1250, + fill_strategy: prev_fill_strategy(idx), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + 
.with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = cursor + .build_aggr_fill_prev(¶ms, &[series], &input_times, &input_aggr_array) + .unwrap(); + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - +--------------------------------+------+ + - "| time | a0 |" + - +--------------------------------+------+ + - "| 1970-01-01T00:00:00.000000950Z | |" + - "| 1970-01-01T00:00:00.000001Z | 10.0 |" + - "| 1970-01-01T00:00:00.000001050Z | 10.0 |" + - "| 1970-01-01T00:00:00.000001100Z | 11.0 |" + - "| 1970-01-01T00:00:00.000001150Z | 11.0 |" + - "| 1970-01-01T00:00:00.000001200Z | 12.0 |" + - "| 1970-01-01T00:00:00.000001250Z | 12.0 |" + - +--------------------------------+------+ + "###); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + } + + #[test] + fn test_cursor_append_aggr_take_prev_with_nulls() { + let input_times = TimestampNanosecondArray::from(vec![ + None, + None, + // 950, + Some(1000), + // 1050 + Some(1100), + // 1150 + Some(1200), + // 1250 + // + ]); + let input_aggr_array: ArrayRef = + Arc::new(Float64Array::from(vec![0.0, 0.1, 10.0, 11.0, 12.0])); + let series = input_times.len(); + + let idx = 0; + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1250, + fill_strategy: prev_fill_strategy(idx), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = cursor + .build_aggr_fill_prev(¶ms, &[series], &input_times, &input_aggr_array) + .unwrap(); + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - +--------------------------------+------+ + - "| time | a0 |" + - +--------------------------------+------+ + - "| | 0.0 |" + - "| | 0.1 |" + - "| 1970-01-01T00:00:00.000000950Z | |" + - "| 
1970-01-01T00:00:00.000001Z | 10.0 |" + - "| 1970-01-01T00:00:00.000001050Z | 10.0 |" + - "| 1970-01-01T00:00:00.000001100Z | 11.0 |" + - "| 1970-01-01T00:00:00.000001150Z | 11.0 |" + - "| 1970-01-01T00:00:00.000001200Z | 12.0 |" + - "| 1970-01-01T00:00:00.000001250Z | 12.0 |" + - +--------------------------------+------+ + "###); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + } + + #[test] + fn test_cursor_append_aggr_take_prev_multi_series() { + let input_times = TimestampNanosecondArray::from(vec![ + // 950 + // 1000 + Some(1050), + // 1100 + // --- new series + // 950 + // 1000 + Some(1050), + // 1100 + ]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![10.0, 11.0])); + let series_ends = vec![1, 2]; + + let idx = 0; + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1100, + fill_strategy: prev_fill_strategy(idx), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = cursor + .build_aggr_fill_null(¶ms, &series_ends, &input_times, &input_aggr_array) + .unwrap(); + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - +--------------------------------+------+ + - "| time | a0 |" + - +--------------------------------+------+ + - "| 1970-01-01T00:00:00.000000950Z | |" + - "| 1970-01-01T00:00:00.000001Z | |" + - "| 1970-01-01T00:00:00.000001050Z | 10.0 |" + - "| 1970-01-01T00:00:00.000001100Z | |" + - "| 1970-01-01T00:00:00.000000950Z | |" + - "| 1970-01-01T00:00:00.000001Z | |" + - "| 1970-01-01T00:00:00.000001050Z | 11.0 |" + - "| 1970-01-01T00:00:00.000001100Z | |" + - +--------------------------------+------+ + "###); + + assert_cursor_end_state(&cursor, 
&input_times, ¶ms); + } + + #[test] + fn test_cursor_aggr_prev_null_as_missing() { + let input_times = TimestampNanosecondArray::from(vec![ + // 950 + // 1000 + Some(1050), + Some(1100), + // --- new series + Some(950), + Some(1000), + Some(1050), + Some(1100), + ]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![ + // 950 + // 1000 + Some(10.0), // 1050 + None, // 1100 + Some(20.0), // 950 + None, // 1000 + Some(21.0), // 1050 + None, // 1100 + ])); + let series_ends = vec![2, 6]; + + let idx = 0; + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1100, + fill_strategy: prev_null_as_missing_fill_strategy(idx), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = cursor + .build_aggr_fill_prev(¶ms, &series_ends, &input_times, &input_aggr_array) + .unwrap(); + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - +--------------------------------+------+ + - "| time | a0 |" + - +--------------------------------+------+ + - "| 1970-01-01T00:00:00.000000950Z | |" + - "| 1970-01-01T00:00:00.000001Z | |" + - "| 1970-01-01T00:00:00.000001050Z | 10.0 |" + - "| 1970-01-01T00:00:00.000001100Z | 10.0 |" + - "| 1970-01-01T00:00:00.000000950Z | 20.0 |" + - "| 1970-01-01T00:00:00.000001Z | 20.0 |" + - "| 1970-01-01T00:00:00.000001050Z | 21.0 |" + - "| 1970-01-01T00:00:00.000001100Z | 21.0 |" + - +--------------------------------+------+ + "###); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + } + + #[test] + fn test_cursor_aggr_prev_null_as_missing_stashed() { + // This test is intended to simulate producing output with + // prev-null-as-missing when the previous element has 
been + // sliced away from the buffered input and is "stashed" in + // another array on the side. + let input_times = TimestampNanosecondArray::from(vec![ + // Some(950), // output in last batch + // ^^^^^^^^^ this element has been sliced off + // 1000 // <-- cursor.next_ts + Some(1050), // context row + Some(1100), // <-- cursor.next_input_offset + // 1150 + // --- new series + None, // null timestamp + // 950 + Some(1000), + Some(1050), + Some(1100), + Some(1100), + ]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![ + // Some(9.0) // 950 + // ^^^^^^^^^ this element has been sliced off + // 1000 // filled with stashed because missing + None, // 1050 // filled with stashed because null + Some(10.0), // 1100 // present + // 1150 // filled with previous because missing + // -- new series + Some(-20.0), // null timestamp + // 950 // null because no value for this series yet + None, // 1000 // still null + Some(21.1), + None, // 1100 // filled with previous because null value in column + None, // 1150 // filled with previous because null value in column + ])); + let series_ends = vec![2, 7]; + + let aggr_col_idx = 0; + let params = GapFillParams { + stride: 50, + first_ts: Some(950), + last_ts: 1150, + fill_strategy: prev_null_as_missing_fill_strategy(aggr_col_idx), + }; + + let stash: Float64Array = vec![None, Some(9.0)].into(); + let stash: ArrayRef = Arc::new(stash); + let output_batch_size = 10000; + let mut cursor = Cursor { + next_input_offset: 1, + next_ts: Some(1000), + remaining_output_batch_size: output_batch_size, + trailing_gaps: false, + aggr_col_states: std::iter::once(( + aggr_col_idx, + AggrColState::PrevNullAsMissingStashed { stash }, + )) + .collect(), + }; + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = cursor + 
.build_aggr_fill_prev_stashed(¶ms, &series_ends, &input_times, &input_aggr_array) + .unwrap(); + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - +--------------------------------+-------+ + - "| time | a0 |" + - +--------------------------------+-------+ + - "| 1970-01-01T00:00:00.000001Z | 9.0 |" + - "| 1970-01-01T00:00:00.000001050Z | 9.0 |" + - "| 1970-01-01T00:00:00.000001100Z | 10.0 |" + - "| 1970-01-01T00:00:00.000001150Z | 10.0 |" + - "| | -20.0 |" + - "| 1970-01-01T00:00:00.000000950Z | |" + - "| 1970-01-01T00:00:00.000001Z | |" + - "| 1970-01-01T00:00:00.000001050Z | 21.1 |" + - "| 1970-01-01T00:00:00.000001100Z | 21.1 |" + - "| 1970-01-01T00:00:00.000001150Z | 21.1 |" + - +--------------------------------+-------+ + "###); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + } + + pub(crate) fn array_to_lines( + time_array: &TimestampNanosecondArray, + aggr_array: &ArrayRef, + ) -> Vec { + let data_type = aggr_array.data_type().clone(); + let schema = Schema::new(vec![ + Field::new( + "time".to_string(), + (&InfluxColumnType::Timestamp).into(), + true, + ), + Field::new("a0".to_string(), data_type, true), + ]); + + let time_array: ArrayRef = Arc::new(time_array.clone()); + let arrays = vec![time_array, Arc::clone(aggr_array)]; + let rb = RecordBatch::try_new(Arc::new(schema), arrays).unwrap(); + batches_to_lines(&[rb]) + } + + pub(crate) fn new_cursor_with_batch_size(params: &GapFillParams, batch_size: usize) -> Cursor { + let mut cursor = Cursor::new(params); + cursor.remaining_output_batch_size = batch_size; + cursor + } + + pub(crate) fn assert_cursor_end_state( + cursor: &Cursor, + input_times: &TimestampNanosecondArray, + params: &GapFillParams, + ) { + assert_eq!(input_times.len(), cursor.next_input_offset); + assert_eq!(params.last_ts + params.stride, cursor.next_ts.unwrap()); + } + + fn simple_fill_strategy() -> HashMap { + std::iter::once((1, FillStrategy::Null)).collect() + } + + fn prev_fill_strategy(idx: 
usize) -> HashMap { + std::iter::once((idx, FillStrategy::PrevNullAsIntentional)).collect() + } + + fn prev_null_as_missing_fill_strategy(idx: usize) -> HashMap { + std::iter::once((idx, FillStrategy::PrevNullAsMissing)).collect() + } +} diff --git a/iox_query/src/exec/gapfill/algo/interpolate.rs b/iox_query/src/exec/gapfill/algo/interpolate.rs new file mode 100644 index 0000000..277e01b --- /dev/null +++ b/iox_query/src/exec/gapfill/algo/interpolate.rs @@ -0,0 +1,592 @@ +//! Filling gaps with interpolated values. +use std::sync::Arc; + +use arrow::{ + array::{ + as_primitive_array, as_struct_array, Array, ArrayRef, PrimitiveArray, StructArray, + TimestampNanosecondArray, + }, + datatypes::{ArrowPrimitiveType, DataType, Float64Type, Int64Type, UInt64Type}, +}; + +use crate::exec::gapfill::params::GapFillParams; + +use datafusion::{ + error::{DataFusionError, Result}, + scalar::ScalarValue, +}; + +use super::{AggrColState, Cursor, RowStatus, VecBuilder}; + +/// [Cursor] methods that are related to interpolation. +impl Cursor { + /// Create an Arrow array with gaps filled in between values + /// using linear interpolation. 
+ pub(super) fn build_aggr_fill_interpolate( + &mut self, + params: &GapFillParams, + series_ends: &[usize], + input_time_array: &TimestampNanosecondArray, + input_aggr_array: &ArrayRef, + ) -> Result { + match input_aggr_array.data_type() { + DataType::Int64 => { + let input_aggr_array = as_primitive_array::(input_aggr_array); + self.build_aggr_fill_interpolate_typed( + params, + series_ends, + input_time_array, + input_aggr_array, + ) + } + DataType::UInt64 => { + let input_aggr_array = as_primitive_array::(input_aggr_array); + self.build_aggr_fill_interpolate_typed( + params, + series_ends, + input_time_array, + input_aggr_array, + ) + } + DataType::Float64 => { + let input_aggr_array = as_primitive_array::(input_aggr_array); + self.build_aggr_fill_interpolate_typed( + params, + series_ends, + input_time_array, + input_aggr_array, + ) + } + DataType::Struct(_) => { + // The only struct type that is expected is the one produced by the + // selector_* functions. These consist of a value, a timestamp and a + // number of associated values selected from the same row. When + // interpolating it is only the value field that will be interpolated. + // All other columns in the structure are filled with nulls. + + let input_aggr_array = as_struct_array(input_aggr_array); + let (fields, arrays, _) = input_aggr_array.clone().into_parts(); + let cursors = fields + .iter() + .map(|f| { + if f.name() == "value" { + // The "value" array uses the parent cursor. 
+                            Ok(None)
+                        } else {
+                            Ok(Some(self.clone_for_aggr_col(None)?))
+                        }
+                    })
+                    .collect::<Result<Vec<Option<Self>>>>()?;
+                let new_arrays = cursors
+                    .into_iter()
+                    .zip(arrays.into_iter())
+                    .map(|(cursor, a)| {
+                        if let Some(mut c) = cursor {
+                            c.build_aggr_fill_null(params, series_ends, input_time_array, &a)
+                        } else {
+                            self.build_aggr_fill_interpolate(
+                                params,
+                                series_ends,
+                                input_time_array,
+                                &a,
+                            )
+                        }
+                    })
+                    .collect::<Result<Vec<ArrayRef>>>()?;
+                Ok(Arc::new(StructArray::new(fields, new_arrays, None)))
+            }
+            dt => Err(DataFusionError::Execution(format!(
+                "unsupported data type {dt} for interpolation gap filling"
+            ))),
+        }
+    }
+
+    /// Create an Arrow array with gaps filled in between values
+    /// using linear interpolation.
+    ///
+    /// This method has a template parameter and so accepts Arrow arrays of either
+    /// [Int64Array], [UInt64Array], or [Float64Array].
+    ///
+    /// [Int64Array]: arrow::array::Int64Array
+    /// [UInt64Array]: arrow::array::UInt64Array
+    /// [Float64Array]: arrow::array::Float64Array
+    pub(super) fn build_aggr_fill_interpolate_typed<T>(
+        &mut self,
+        params: &GapFillParams,
+        series_ends: &[usize],
+        input_time_array: &TimestampNanosecondArray,
+        input_aggr_array: &PrimitiveArray<T>,
+    ) -> Result<ArrayRef>
+    where
+        T: ArrowPrimitiveType,
+        T::Native: LinearInterpolate,
+        PrimitiveArray<T>: From<Vec<Option<T::Native>>>,
+        Segment<T::Native>: TryFrom<Segment<ScalarValue>, Error = DataFusionError>,
+        Segment<ScalarValue>: From<Segment<T::Native>>,
+    {
+        // Restore any in-progress segment carried over from the previous
+        // output batch, converting it from ScalarValue to the native type.
+        let segment = self
+            .get_aggr_col_state()
+            .segment()
+            .as_ref()
+            .map(|seg| Segment::<T::Native>::try_from(seg.clone()))
+            .transpose()?;
+        let mut builder = InterpolateBuilder {
+            values: Vec::with_capacity(self.remaining_output_batch_size),
+            segment,
+            input_time_array,
+            input_aggr_array,
+        };
+        self.build_vec(params, input_time_array, series_ends, &mut builder)?;
+
+        // Stash the (possibly still open) segment back into the cursor state
+        // as ScalarValues so the next output batch can resume it.
+        let segment: Option<Segment<ScalarValue>> = builder.segment.clone().map(|seg| seg.into());
+        self.set_aggr_col_state(AggrColState::LinearInterpolate(segment));
+        let array: PrimitiveArray<T> = builder.values.into();
+        Ok(Arc::new(array))
+    }
+}
+
+/// Represents two non-null data values at two points in time, where the
+/// gap between them must be filled. The template parameter `T` stands in for
+/// the type of the input aggregate column being filled.
+#[derive(Clone, Debug)]
+pub struct Segment<T> {
+    start_point: (i64, T),
+    end_point: (i64, T),
+}
+
+/// A macro to go from `Segment<$NATIVE>` into [`Segment<ScalarValue>`].
+/// Between output batches data values in segments are stored as [`ScalarValue`]
+/// to avoid type parameters in [`Cursor`].
+macro_rules! impl_try_from_segment_native {
+    ($NATIVE:ident) => {
+        impl TryFrom<Segment<ScalarValue>> for Segment<$NATIVE> {
+            type Error = DataFusionError;
+
+            fn try_from(segment: Segment<ScalarValue>) -> Result<Self> {
+                let Segment {
+                    start_point: (start_ts, start_sv),
+                    end_point: (end_ts, end_sv),
+                } = segment;
+
+                let start_v = $NATIVE::try_from(start_sv)?;
+                let end_v = $NATIVE::try_from(end_sv)?;
+                Ok(Segment {
+                    start_point: (start_ts, start_v),
+                    end_point: (end_ts, end_v),
+                })
+            }
+        }
+    };
+}
+
+impl_try_from_segment_native!(i64);
+impl_try_from_segment_native!(u64);
+impl_try_from_segment_native!(f64);
+
+/// A macro to go from [`Segment<ScalarValue>`] into `Segment<$NATIVE>`.
+/// When producing an output batch, it's easiest to use the native type
+/// to represent segments being filled.
+macro_rules! impl_from_segment_scalar_value {
+    ($NATIVE:ident) => {
+        impl From<Segment<$NATIVE>> for Segment<ScalarValue> {
+            fn from(segment: Segment<$NATIVE>) -> Self {
+                let Segment {
+                    start_point: (start_ts, start_native),
+                    end_point: (end_ts, end_native),
+                } = segment;
+
+                let start_v = ScalarValue::from(start_native);
+                let end_v = ScalarValue::from(end_native);
+                Segment {
+                    start_point: (start_ts, start_v),
+                    end_point: (end_ts, end_v),
+                }
+            }
+        }
+    };
+}
+
+impl_from_segment_scalar_value!(i64);
+impl_from_segment_scalar_value!(u64);
+impl_from_segment_scalar_value!(f64);
+
+/// Implements [`VecBuilder`] for building aggregate columns whose gaps
+/// are being filled using linear interpolation.
+pub(super) struct InterpolateBuilder<'a, T: ArrowPrimitiveType> {
+    pub values: Vec<Option<T::Native>>,
+    pub segment: Option<Segment<T::Native>>,
+    pub input_time_array: &'a TimestampNanosecondArray,
+    pub input_aggr_array: &'a PrimitiveArray<T>,
+}
+
+impl<'a, T> VecBuilder for InterpolateBuilder<'a, T>
+where
+    T: ArrowPrimitiveType,
+    T::Native: LinearInterpolate,
+{
+    fn push(&mut self, row_status: RowStatus) -> Result<()> {
+        match row_status {
+            RowStatus::NullTimestamp { offset, .. } => self.copy_point(offset),
+            RowStatus::Present {
+                ts,
+                offset,
+                series_end_offset,
+            } => {
+                if self.input_aggr_array.is_valid(offset) {
+                    let end_offset = self.find_end_offset(offset, series_end_offset);
+                    // Find the next non-null value in this column for the series.
+                    // If there is one, start a new segment at the current value.
+                    self.segment = end_offset.map(|end_offset| Segment {
+                        start_point: (ts, self.input_aggr_array.value(offset)),
+                        end_point: (
+                            self.input_time_array.value(end_offset),
+                            self.input_aggr_array.value(end_offset),
+                        ),
+                    });
+                    self.copy_point(offset);
+                } else {
+                    self.values.push(
+                        self.segment
+                            .as_ref()
+                            .map(|seg| T::Native::interpolate(seg, ts)),
+                    );
+                }
+            }
+            RowStatus::Missing { ts, .. } => self.values.push(
+                self.segment
+                    .as_ref()
+                    .map(|seg| T::Native::interpolate(seg, ts)),
+            ),
+        }
+        Ok(())
+    }
+
+    fn start_new_series(&mut self) -> Result<()> {
+        self.segment = None;
+        Ok(())
+    }
+}
+
+impl<T> InterpolateBuilder<'_, T>
+where
+    T: ArrowPrimitiveType,
+{
+    /// Copies a point at `offset` into the vector that will be used to build
+    /// an Arrow array.
+    fn copy_point(&mut self, offset: usize) {
+        let v = self
+            .input_aggr_array
+            .is_valid(offset)
+            .then_some(self.input_aggr_array.value(offset));
+        self.values.push(v)
+    }
+
+    /// Scan forward to find the endpoint for a segment that starts at `start_offset`.
+    /// Skip over any null values.
+    ///
+    /// We are guaranteed to have buffered enough input to find the next non-null point for this series,
+    /// if there is one, by the logic in [`BufferedInput`].
+    ///
+    /// [`BufferedInput`]: super::super::buffered_input::BufferedInput
+    fn find_end_offset(&self, start_offset: usize, series_end_offset: usize) -> Option<usize> {
+        ((start_offset + 1)..series_end_offset).find(|&i| self.input_aggr_array.is_valid(i))
+    }
+}
+
+/// A trait for the native numeric types that can be interpolated
+/// by IOx.
+///
+/// All implementations match what the
+/// [1.8 Go implementation](https://github.com/influxdata/influxdb/blob/master/query/linear.go)
+/// of InfluxQL does.
+pub(super) trait LinearInterpolate
+where
+    Self: Sized,
+{
+    /// Given a [`Segment`] compute the value of the column at timestamp `ts`.
+    fn interpolate(segment: &Segment<Self>, ts: i64) -> Self;
+}
+
+impl LinearInterpolate for i64 {
+    fn interpolate(segment: &Segment<Self>, ts: i64) -> Self {
+        // Compute in f64 then truncate toward zero, matching InfluxQL.
+        let rise = (segment.end_point.1 - segment.start_point.1) as f64;
+        let run = (segment.end_point.0 - segment.start_point.0) as f64;
+        let m = rise / run;
+        let x = (ts - segment.start_point.0) as f64;
+        let b: f64 = segment.start_point.1 as f64;
+        (m * x + b) as Self
+    }
+}
+
+impl LinearInterpolate for u64 {
+    fn interpolate(segment: &Segment<Self>, ts: i64) -> Self {
+        // A descending segment has a negative slope; use abs_diff so the
+        // unsigned subtraction cannot overflow.
+        let rise = if segment.end_point.1 >= segment.start_point.1 {
+            (segment.end_point.1 - segment.start_point.1) as f64
+        } else {
+            -(segment.end_point.1.abs_diff(segment.start_point.1) as f64)
+        };
+        let run = (segment.end_point.0 - segment.start_point.0) as f64;
+        let m = rise / run;
+        let x = (ts - segment.start_point.0) as f64;
+        let b: f64 = segment.start_point.1 as f64;
+        (m * x + b) as Self
+    }
+}
+
+impl LinearInterpolate for f64 {
+    fn interpolate(segment: &Segment<Self>, ts: i64) -> Self {
+        let rise = segment.end_point.1 - segment.start_point.1;
+        let run = (segment.end_point.0 - segment.start_point.0) as Self;
+        let m = rise / run;
+        let x = (ts - segment.start_point.0) as Self;
+        let b = segment.start_point.1;
+        m * x + b
+    }
+} + +/// These tests verify that interpolation works as expected for each data type. +/// For comprehensive tests that handle multiple series and input/output +/// batches, see [crate::exec::gapfill::exec_tests]. +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow::array::{ArrayRef, Float64Array, Int64Array, TimestampNanosecondArray, UInt64Array}; + use hashbrown::HashMap; + use schema::TIME_DATA_TIMEZONE; + + use crate::exec::gapfill::{ + algo::tests::{array_to_lines, assert_cursor_end_state, new_cursor_with_batch_size}, + params::GapFillParams, + FillStrategy, + }; + + /// Verify the rounding behavior (really just truncating towards zero) which is + /// what InfluxQL does. Also verify that we can have a descending slope in the + /// line that does not overflow a `u64`. + #[test] + fn test_interpolate_u64() { + let input_times = TimestampNanosecondArray::from(vec![ + // 1000 + Some(1100), + // 1200 + // 1300 + Some(1400), + Some(1500), + // 1600 + Some(1700), + // 1800 + Some(1900), + // 2000 + ]); + let input_aggr_array: ArrayRef = Arc::new(UInt64Array::from(vec![ + // 1000 + Some(100), // 1100 + // 1200 + // 1300 + Some(200), // 1400 + None, // 1500 + // 1600 + Some(1000), // 1700 + // 1800 + Some(0), // 1900 + // 2000 + ])); + let series_ends = vec![input_times.len()]; + + let idx = 0; + let params = GapFillParams { + stride: 100, + first_ts: Some(1000), + last_ts: 2000, + fill_strategy: interpolate_fill_strategy(idx), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = cursor + .build_aggr_fill_interpolate(¶ms, &series_ends, &input_times, &input_aggr_array) + .unwrap(); + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - 
+--------------------------------+------+ + - "| time | a0 |" + - +--------------------------------+------+ + - "| 1970-01-01T00:00:00.000001Z | |" + - "| 1970-01-01T00:00:00.000001100Z | 100 |" + - "| 1970-01-01T00:00:00.000001200Z | 133 |" + - "| 1970-01-01T00:00:00.000001300Z | 166 |" + - "| 1970-01-01T00:00:00.000001400Z | 200 |" + - "| 1970-01-01T00:00:00.000001500Z | 466 |" + - "| 1970-01-01T00:00:00.000001600Z | 733 |" + - "| 1970-01-01T00:00:00.000001700Z | 1000 |" + - "| 1970-01-01T00:00:00.000001800Z | 500 |" + - "| 1970-01-01T00:00:00.000001900Z | 0 |" + - "| 1970-01-01T00:00:00.000002Z | |" + - +--------------------------------+------+ + "###); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + } + + #[test] + fn test_interpolate_i64() { + let input_times = TimestampNanosecondArray::from(vec![ + // 1000 + Some(1100), + // 1200 + // 1300 + Some(1400), + Some(1500), + // 1600 + Some(1700), + // 1800 + Some(1900), + // 2000 + ]); + let input_aggr_array: ArrayRef = Arc::new(Int64Array::from(vec![ + // 1000 + Some(100), // 1100 + // 1200 + // 1300 + Some(200), // 1400 + None, // 1500 + // 1600 + Some(1000), // 1700 + // 1800 + Some(0), // 1900 + // 2000 + ])); + let series_ends = vec![input_times.len()]; + + let idx = 0; + let params = GapFillParams { + stride: 100, + first_ts: Some(1000), + last_ts: 2000, + fill_strategy: interpolate_fill_strategy(idx), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = cursor + .build_aggr_fill_interpolate(¶ms, &series_ends, &input_times, &input_aggr_array) + .unwrap(); + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - +--------------------------------+------+ + - "| time | a0 |" + - 
+--------------------------------+------+ + - "| 1970-01-01T00:00:00.000001Z | |" + - "| 1970-01-01T00:00:00.000001100Z | 100 |" + - "| 1970-01-01T00:00:00.000001200Z | 133 |" + - "| 1970-01-01T00:00:00.000001300Z | 166 |" + - "| 1970-01-01T00:00:00.000001400Z | 200 |" + - "| 1970-01-01T00:00:00.000001500Z | 466 |" + - "| 1970-01-01T00:00:00.000001600Z | 733 |" + - "| 1970-01-01T00:00:00.000001700Z | 1000 |" + - "| 1970-01-01T00:00:00.000001800Z | 500 |" + - "| 1970-01-01T00:00:00.000001900Z | 0 |" + - "| 1970-01-01T00:00:00.000002Z | |" + - +--------------------------------+------+ + "###); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + } + + #[test] + fn test_interpolate_f64() { + let input_times = TimestampNanosecondArray::from(vec![ + // 1000 + Some(1100), + // 1200 + // 1300 + Some(1400), + Some(1500), + // 1600 + Some(1700), + // 1800 + Some(1900), + // 2000 + ]); + let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![ + // 1000 + Some(100.0), // 1100 + // 1200 + // 1300 + Some(400.0), // 1400 + None, // 1500 + // 1600 + Some(1000.0), // 1700 + // 1800 + Some(0.0), // 1900 + // 2000 + ])); + let series_ends = vec![input_times.len()]; + + let idx = 0; + let params = GapFillParams { + stride: 100, + first_ts: Some(1000), + last_ts: 2000, + fill_strategy: interpolate_fill_strategy(idx), + }; + + let output_batch_size = 10000; + let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); + + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + let arr = cursor + .build_aggr_fill_interpolate(¶ms, &series_ends, &input_times, &input_aggr_array) + .unwrap(); + insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" + --- + - +--------------------------------+--------+ + - "| time | a0 |" + - +--------------------------------+--------+ + - "| 
1970-01-01T00:00:00.000001Z | |" + - "| 1970-01-01T00:00:00.000001100Z | 100.0 |" + - "| 1970-01-01T00:00:00.000001200Z | 200.0 |" + - "| 1970-01-01T00:00:00.000001300Z | 300.0 |" + - "| 1970-01-01T00:00:00.000001400Z | 400.0 |" + - "| 1970-01-01T00:00:00.000001500Z | 600.0 |" + - "| 1970-01-01T00:00:00.000001600Z | 800.0 |" + - "| 1970-01-01T00:00:00.000001700Z | 1000.0 |" + - "| 1970-01-01T00:00:00.000001800Z | 500.0 |" + - "| 1970-01-01T00:00:00.000001900Z | 0.0 |" + - "| 1970-01-01T00:00:00.000002Z | |" + - +--------------------------------+--------+ + "###); + + assert_cursor_end_state(&cursor, &input_times, ¶ms); + } + + fn interpolate_fill_strategy(idx: usize) -> HashMap { + std::iter::once((idx, FillStrategy::LinearInterpolate)).collect() + } +} diff --git a/iox_query/src/exec/gapfill/buffered_input.rs b/iox_query/src/exec/gapfill/buffered_input.rs new file mode 100644 index 0000000..59ae311 --- /dev/null +++ b/iox_query/src/exec/gapfill/buffered_input.rs @@ -0,0 +1,502 @@ +//! Logic for buffering record batches for gap filling. + +use std::sync::Arc; + +use arrow::{ + array::{as_struct_array, ArrayRef}, + datatypes::DataType, + record_batch::RecordBatch, + row::{RowConverter, Rows, SortField}, +}; +use datafusion::error::{DataFusionError, Result}; +use hashbrown::HashSet; + +use super::{params::GapFillParams, FillStrategy}; + +/// Encapsulate the logic around how to buffer input records. +/// +/// If there are no columns with [`FillStrategy::LinearInterpolate`], then +/// we need to buffer up to the last input row that might appear in the output, plus +/// one additional row. +/// +/// However, if there are columns filled via interpolation, then we need +/// to ensure that we read ahead far enough to a non-null value, or a change +/// of group columns, in the columns being interpolated. 
+/// +/// [`FillStrategy::LinearInterpolate`]: super::FillStrategy::LinearInterpolate +/// [`GapFillStream`]: super::stream::GapFillStream +pub(super) struct BufferedInput { + /// Indexes of group columns in the schema (not including time). + group_cols: Vec, + /// Indexes of aggregate columns filled via interpolation. + interpolate_cols: Vec, + /// Buffered records from the input stream. + batches: Vec, + /// When gap filling with interpolated values, this row converter + /// is used to compare rows to see if group columns have changed. + row_converter: Option, + /// When gap filling with interpolated values, cache a row-oriented + /// representation of the last row that may appear in the output so + /// it doesn't need to be computed more than once. + last_output_row: Option, +} + +impl BufferedInput { + pub(super) fn new(params: &GapFillParams, group_cols: Vec) -> Self { + let interpolate_cols = params + .fill_strategy + .iter() + .filter_map(|(col_offset, fs)| { + (fs == &FillStrategy::LinearInterpolate).then_some(*col_offset) + }) + .collect::>(); + Self { + group_cols, + interpolate_cols, + batches: vec![], + row_converter: None, + last_output_row: None, + } + } + /// Add a new batch of buffered records from the input stream. + pub(super) fn push(&mut self, batch: RecordBatch) { + self.batches.push(batch); + } + + /// Transfer ownership of the buffered record batches to the caller for + /// processing. + pub(super) fn take(&mut self) -> Vec { + self.last_output_row = None; + std::mem::take(&mut self.batches) + } + + /// Determine if we need more input before we start processing. + pub(super) fn need_more(&mut self, last_output_row_offset: usize) -> Result { + let record_count: usize = self.batches.iter().map(|rb| rb.num_rows()).sum(); + // min number of rows needed is the number of rows up to and including + // the last row that may appear in the output, plus one more row. 
+ let min_needed = last_output_row_offset + 2; + + if record_count < min_needed { + return Ok(true); + } else if self.interpolate_cols.is_empty() { + return Ok(false); + } + + // Check to see if the last row that might appear in the output + // has a different group column values than the last buffered row. + // If they are different, then we have enough input to start. + let (last_output_batch_offset, last_output_row_offset) = self + .find_row_idx(last_output_row_offset) + .expect("checked record count"); + if self.group_columns_changed((last_output_batch_offset, last_output_row_offset))? { + return Ok(false); + } + + // Now check if there are non-null values in the columns being interpolated. + // We skip over the batches that come before the one that contains the last + // possible output row. We start with the last buffered batch, so we can avoid + // having to slice unless necessary. + let mut cols_that_need_more = + HashSet::::from_iter(self.interpolate_cols.iter().cloned()); + let mut to_remove = vec![]; + for (i, batch) in self + .batches + .iter() + .enumerate() + .skip(last_output_batch_offset) + .rev() + { + for col_offset in cols_that_need_more.clone() { + // If this is the batch containing the last possible output row, slice the + // array so we are just looking at that value and the ones after. 
+ let array = batch.column(col_offset); + let array = if i == last_output_batch_offset { + let length = array.len() - last_output_row_offset; + batch + .column(col_offset) + .slice(last_output_row_offset, length) + } else { + Arc::clone(array) + }; + + let struct_value_col = if let DataType::Struct(fields) = array.data_type().clone() { + fields.find("value").map(|(n, _)| n) + } else { + None + }; + + match struct_value_col { + Some(n) => { + let value_array = as_struct_array(&array).column(n); + if array.null_count() < array.len() + && value_array.null_count() < value_array.len() + { + to_remove.push(col_offset); + } + } + None => { + if array.null_count() < array.len() { + to_remove.push(col_offset); + } + } + } + } + + to_remove.drain(..).for_each(|c| { + cols_that_need_more.remove(&c); + }); + if cols_that_need_more.is_empty() { + break; + } + } + + Ok(!cols_that_need_more.is_empty()) + } + + /// Check to see if the group column values have changed between the last row + /// that may be in the output and the last buffered input row. + /// + /// This method uses the row-oriented representation of Arrow data from [`arrow::row`] to + /// compare rows in different record batches. + /// + /// [`arrow::row`]: https://docs.rs/arrow-row/36.0.0/arrow_row/index.html + fn group_columns_changed(&mut self, last_output_row_idx: (usize, usize)) -> Result { + if self.group_cols.is_empty() { + return Ok(false); + } + + let last_buffered_row_idx = self.last_buffered_row_idx(); + if last_output_row_idx == last_buffered_row_idx { + // the output row is also the last buffered row, + // so there is nothing to compare. + return Ok(false); + } + + let last_input_rows = self.convert_row(self.last_buffered_row_idx())?; + let last_row_in_output = self.last_output_row(last_output_row_idx)?; + + Ok(last_row_in_output.row(0) != last_input_rows.row(0)) + } + + /// Get a row converter for comparing records. Keep it in [`Self::row_converter`] + /// to avoid creating it multiple times. 
+ fn get_row_converter(&mut self) -> Result<&mut RowConverter> { + if self.row_converter.is_none() { + let batch = self.batches.first().expect("at least one batch"); + let sort_fields = self + .group_cols + .iter() + .map(|c| SortField::new(batch.column(*c).data_type().clone())) + .collect(); + let row_converter = RowConverter::new(sort_fields) + .map_err(|err| DataFusionError::ArrowError(err, None))?; + self.row_converter = Some(row_converter); + } + Ok(self.row_converter.as_mut().expect("cannot be none")) + } + + /// Convert a row to row-oriented format for easy comparison. + fn convert_row(&mut self, row_idxs: (usize, usize)) -> Result { + let batch = &self.batches[row_idxs.0]; + let columns: Vec = self + .group_cols + .iter() + .map(|col_idx| batch.column(*col_idx).slice(row_idxs.1, 1)) + .collect(); + self.get_row_converter()? + .convert_columns(&columns) + .map_err(|err| DataFusionError::ArrowError(err, None)) + } + + /// Returns the row-oriented representation of the last buffered row that may appear in the next + /// output batch. Since this row may be used multiple times, cache it in `self` to + /// avoid computing it multiple times. + fn last_output_row(&mut self, idxs: (usize, usize)) -> Result<&Rows> { + if self.last_output_row.is_none() { + let rows = self.convert_row(idxs)?; + self.last_output_row = Some(rows); + } + Ok(self.last_output_row.as_ref().expect("cannot be none")) + } + + /// Return the `(batch_idx, row_idx)` of the last buffered row. + fn last_buffered_row_idx(&self) -> (usize, usize) { + let last_batch_len = self.batches.last().unwrap().num_rows(); + (self.batches.len() - 1, last_batch_len - 1) + } + + /// Return the `(batch_idx, row_idx)` of the `nth` row. 
+    fn find_row_idx(&self, mut nth: usize) -> Option<(usize, usize)> {
+        let mut idx = None;
+        for (i, batch) in self.batches.iter().enumerate() {
+            if nth >= batch.num_rows() {
+                nth -= batch.num_rows()
+            } else {
+                idx = Some((i, nth));
+                break;
+            }
+        }
+        idx
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::VecDeque;
+
+    use arrow_util::test_util::batches_to_lines;
+
+    use super::*;
+    use crate::exec::gapfill::exec_tests::TestRecords;
+
+    fn test_records(batch_size: usize) -> VecDeque<RecordBatch> {
+        let records = TestRecords {
+            group_cols: vec![
+                std::iter::repeat(Some("a")).take(12).collect(),
+                std::iter::repeat(Some("b"))
+                    .take(6)
+                    .chain(std::iter::repeat(Some("c")).take(6))
+                    .collect(),
+            ],
+            time_col: (0..12).map(|i| Some(1000 + i * 5)).take(12).collect(),
+            timezone: None,
+            agg_cols: vec![
+                vec![
+                    Some(1),
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    Some(10),
+                ],
+                vec![
+                    Some(2),
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    Some(20),
+                    None,
+                    None,
+                    None,
+                ],
+                (0..12).map(Some).collect(),
+            ],
+            struct_cols: vec![],
+            input_batch_size: batch_size,
+        };
+
+        TryInto::<Vec<RecordBatch>>::try_into(records)
+            .unwrap()
+            .into()
+    }
+
+    fn test_struct_records(batch_size: usize) -> VecDeque<RecordBatch> {
+        let records = TestRecords {
+            group_cols: vec![
+                std::iter::repeat(Some("a")).take(12).collect(),
+                std::iter::repeat(Some("b"))
+                    .take(6)
+                    .chain(std::iter::repeat(Some("c")).take(6))
+                    .collect(),
+            ],
+            time_col: (0..12).map(|i| Some(1000 + i * 5)).take(12).collect(),
+            timezone: None,
+            agg_cols: vec![],
+            struct_cols: vec![
+                vec![
+                    Some(vec![1, 0]),
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    Some(vec![10, 0]),
+                ],
+                vec![
+                    Some(vec![2, 0]),
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    None,
+                    Some(vec![20, 0]),
+                    None,
+                    None,
+                    None,
+                ],
+                (0..12).map(|n| Some(vec![n, 0])).collect(),
+            ],
+            input_batch_size: batch_size,
+        };
+
+        TryInto::<Vec<RecordBatch>>::try_into(records)
+            
.unwrap() + .into() + } + + fn test_params() -> GapFillParams { + GapFillParams { + stride: 50_000_000, + first_ts: Some(1_000_000_000), + last_ts: 1_055_000_000, + fill_strategy: [ + (3, FillStrategy::LinearInterpolate), + (4, FillStrategy::LinearInterpolate), + ] + .into(), + } + } + + // This test is just here so it's clear what the + // test data is + #[test] + fn test_test_records() { + let batch = test_records(1000).pop_front().unwrap(); + let actual = batches_to_lines(&[batch]); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+----+--------------------------+----+----+----+ + - "| g0 | g1 | time | a0 | a1 | a2 |" + - +----+----+--------------------------+----+----+----+ + - "| a | b | 1970-01-01T00:00:01Z | 1 | 2 | 0 |" + - "| a | b | 1970-01-01T00:00:01.005Z | | | 1 |" + - "| a | b | 1970-01-01T00:00:01.010Z | | | 2 |" + - "| a | b | 1970-01-01T00:00:01.015Z | | | 3 |" + - "| a | b | 1970-01-01T00:00:01.020Z | | | 4 |" + - "| a | b | 1970-01-01T00:00:01.025Z | | | 5 |" + - "| a | c | 1970-01-01T00:00:01.030Z | | | 6 |" + - "| a | c | 1970-01-01T00:00:01.035Z | | | 7 |" + - "| a | c | 1970-01-01T00:00:01.040Z | | 20 | 8 |" + - "| a | c | 1970-01-01T00:00:01.045Z | | | 9 |" + - "| a | c | 1970-01-01T00:00:01.050Z | | | 10 |" + - "| a | c | 1970-01-01T00:00:01.055Z | 10 | | 11 |" + - +----+----+--------------------------+----+----+----+ + "###); + } + + #[test] + fn no_group_no_interpolate() { + let batch_size = 3; + let mut params = test_params(); + params.fill_strategy = [].into(); + + let mut buffered_input = BufferedInput::new(¶ms, vec![]); + let mut batches = test_records(batch_size); + + // There are no rows, so that is less than the batch size, + // it needs more. + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // There are now 3 rows, still less than batch_size + 1, + // so it needs more. 
+ buffered_input.push(batches.pop_front().unwrap()); + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // We now have batch_size * 2, records, which is enough. + buffered_input.push(batches.pop_front().unwrap()); + assert!(!buffered_input.need_more(batch_size - 1).unwrap()); + } + + #[test] + fn no_group() { + let batch_size = 3; + let params = test_params(); + let mut buffered_input = BufferedInput::new(¶ms, vec![]); + let mut batches = test_records(batch_size); + + // There are no rows, so that is less than the batch size, + // it needs more. + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // There are now 3 rows, still less than batch_size + 1, + // so it needs more. + buffered_input.push(batches.pop_front().unwrap()); + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // There are now 6 rows, if we were not interpolating, + // this would be enough. + buffered_input.push(batches.pop_front().unwrap()); + + // If we are interpolating, there are no non null values + // at offset 5. + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // Push more rows, now totaling 9. + buffered_input.push(batches.pop_front().unwrap()); + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + // Column `a1` has a non-null value at offset 8. + // If that were the only column being interpolated, we would have enough. + + // 12 rows, with non-null values in both columns being interpolated. 
+ buffered_input.push(batches.pop_front().unwrap()); + assert!(!buffered_input.need_more(batch_size - 1).unwrap()); + } + + #[test] + fn with_group() { + let params = test_params(); + let group_cols = vec![0, 1]; + let mut buffered_input = BufferedInput::new(¶ms, group_cols); + + let batch_size = 3; + let mut batches = test_records(batch_size); + + // no rows + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // 3 rows + buffered_input.push(batches.pop_front().unwrap()); + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // 6 rows + buffered_input.push(batches.pop_front().unwrap()); + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // 9 rows (series changes here) + buffered_input.push(batches.pop_front().unwrap()); + assert!(!buffered_input.need_more(batch_size - 1).unwrap()); + } + + #[test] + fn struct_with_group() { + let params = test_params(); + let group_cols = vec![0, 1]; + let mut buffered_input = BufferedInput::new(¶ms, group_cols); + + let batch_size = 3; + let mut batches = test_struct_records(batch_size); + + // no rows + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // 3 rows + buffered_input.push(batches.pop_front().unwrap()); + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // 6 rows + buffered_input.push(batches.pop_front().unwrap()); + assert!(buffered_input.need_more(batch_size - 1).unwrap()); + + // 9 rows (series changes here) + buffered_input.push(batches.pop_front().unwrap()); + assert!(!buffered_input.need_more(batch_size - 1).unwrap()); + } +} diff --git a/iox_query/src/exec/gapfill/exec_tests.rs b/iox_query/src/exec/gapfill/exec_tests.rs new file mode 100644 index 0000000..cc0a190 --- /dev/null +++ b/iox_query/src/exec/gapfill/exec_tests.rs @@ -0,0 +1,1619 @@ +//! Tests that verify output produced by [GapFillExec]. 
+ +use std::{ + cmp::Ordering, + ops::{Bound, Range}, +}; + +use super::*; +use arrow::{ + array::{ArrayRef, DictionaryArray, Int64Array, StructArray, TimestampNanosecondArray}, + datatypes::{DataType, Field, Fields, Int32Type, Schema, TimeUnit}, + record_batch::RecordBatch, +}; +use arrow_util::test_util::batches_to_lines; +use datafusion::{ + error::Result, + execution::runtime_env::{RuntimeConfig, RuntimeEnv}, + physical_plan::{ + collect, expressions::col as phys_col, expressions::lit as phys_lit, memory::MemoryExec, + }, + prelude::{SessionConfig, SessionContext}, + scalar::ScalarValue, +}; +use futures::executor::block_on; +use observability_deps::tracing::debug; +use schema::{InfluxColumnType, InfluxFieldType}; +use test_helpers::assert_error; + +#[test] +fn test_gapfill_simple() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! { for output_batch_size in [1, 2, 4, 8] { + for input_batch_size in [1, 2] { + let batch = TestRecords { + group_cols: vec![vec![Some("a"), Some("a")]], + time_col: vec![Some(1_000), Some(1_100)], + timezone: None, + agg_cols: vec![vec![Some(10), Some(11)]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&batch, 25, Some(975), 1_125); + let tc = TestCase { + test_records: batch, + output_batch_size, + params, + }; + // For this simple test case, also test that + // memory is tracked correctly, which is done by + // TestCase when running with a memory limit. 
+ let batches = tc.run_with_memory_limit(16384).unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | 1970-01-01T00:00:00.975Z | |" + - "| a | 1970-01-01T00:00:01Z | 10 |" + - "| a | 1970-01-01T00:00:01.025Z | |" + - "| a | 1970-01-01T00:00:01.050Z | |" + - "| a | 1970-01-01T00:00:01.075Z | |" + - "| a | 1970-01-01T00:00:01.100Z | 11 |" + - "| a | 1970-01-01T00:00:01.125Z | |" + - +----+--------------------------+----+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_simple_tz() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! { for output_batch_size in [1, 2, 4, 8] { + for input_batch_size in [1, 2] { + let batch = TestRecords { + group_cols: vec![vec![Some("a"), Some("a")]], + time_col: vec![Some(1_000), Some(1_100)], + timezone: Some("Australia/Adelaide".into()), + agg_cols: vec![vec![Some(10), Some(11)]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&batch, 25, Some(975), 1_125); + let tc = TestCase { + test_records: batch, + output_batch_size, + params, + }; + // For this simple test case, also test that + // memory is tracked correctly, which is done by + // TestCase when running with a memory limit. 
+ let batches = tc.run_with_memory_limit(16384).unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+-------------------------------+----+ + - "| g0 | time | a0 |" + - +----+-------------------------------+----+ + - "| a | 1970-01-01T09:30:00.975+09:30 | |" + - "| a | 1970-01-01T09:30:01+09:30 | 10 |" + - "| a | 1970-01-01T09:30:01.025+09:30 | |" + - "| a | 1970-01-01T09:30:01.050+09:30 | |" + - "| a | 1970-01-01T09:30:01.075+09:30 | |" + - "| a | 1970-01-01T09:30:01.100+09:30 | 11 |" + - "| a | 1970-01-01T09:30:01.125+09:30 | |" + - +----+-------------------------------+----+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_simple_no_group_no_aggr() { + // There may be no group columns in a gap fill query, + // and there may be no aggregate columns as well. + // Such a query is not all that useful but it should work. + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ for output_batch_size in [1, 2, 4, 8] { + for input_batch_size in [1, 2, 4] { + let batch = TestRecords { + group_cols: vec![], + time_col: vec![None, Some(1_000), Some(1_100)], + timezone: None, + agg_cols: vec![], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&batch, 25, Some(975), 1_125); + let tc = TestCase { + test_records: batch, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +--------------------------+ + - "| time |" + - +--------------------------+ + - "| |" + - "| 1970-01-01T00:00:00.975Z |" + - "| 1970-01-01T00:00:01Z |" + - "| 1970-01-01T00:00:01.025Z |" + - "| 1970-01-01T00:00:01.050Z |" + - "| 1970-01-01T00:00:01.075Z |" + - "| 1970-01-01T00:00:01.100Z |" + - "| 1970-01-01T00:00:01.125Z |" + - +--------------------------+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_multi_group_simple() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ for output_batch_size in [1, 2, 4, 8, 16] { + for input_batch_size in [1, 2, 4] { + let records = TestRecords { + group_cols: vec![vec![Some("a"), Some("a"), Some("b"), Some("b")]], + time_col: vec![Some(1_000), Some(1_100), Some(1_025), Some(1_050)], + timezone: None, + agg_cols: vec![vec![Some(10), Some(11), Some(20), Some(21)]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&records, 25, Some(975), 1_125); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | 1970-01-01T00:00:00.975Z | |" + - "| a | 1970-01-01T00:00:01Z | 10 |" + - "| a | 1970-01-01T00:00:01.025Z | |" + - "| a | 1970-01-01T00:00:01.050Z | |" + - "| a | 1970-01-01T00:00:01.075Z | |" + - "| a | 1970-01-01T00:00:01.100Z | 11 |" + - "| a | 1970-01-01T00:00:01.125Z | |" + - "| b | 1970-01-01T00:00:00.975Z | |" + - "| b | 1970-01-01T00:00:01Z | |" + - "| b | 1970-01-01T00:00:01.025Z | 20 |" + - "| b | 1970-01-01T00:00:01.050Z | 21 |" + - "| b | 1970-01-01T00:00:01.075Z | |" + - "| b | 1970-01-01T00:00:01.100Z | |" + - "| b | 1970-01-01T00:00:01.125Z | |" + - +----+--------------------------+----+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_multi_group_simple_origin() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ for output_batch_size in [1, 2, 4, 8, 16] { + for input_batch_size in [1, 2, 4] { + let records = TestRecords { + group_cols: vec![vec![Some("a"), Some("a"), Some("b"), Some("b")]], + time_col: vec![Some(1_000), Some(1_100), Some(1_025), Some(1_050)], + timezone: None, + agg_cols: vec![vec![Some(10), Some(11), Some(20), Some(21)]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms_with_origin_fill_strategy(&records, 25, Some(975), 1_125, Some(3), FillStrategy::Null); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + // timestamps are now offset by 3ms + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | 1970-01-01T00:00:00.953Z | |" + - "| a | 1970-01-01T00:00:00.978Z | |" + - "| a | 1970-01-01T00:00:01.003Z | 10 |" + - "| a | 1970-01-01T00:00:01.028Z | |" + - "| a | 1970-01-01T00:00:01.053Z | |" + - "| a | 1970-01-01T00:00:01.078Z | |" + - "| a | 1970-01-01T00:00:01.103Z | 11 |" + - "| b | 1970-01-01T00:00:00.953Z | |" + - "| b | 1970-01-01T00:00:00.978Z | |" + - "| b | 1970-01-01T00:00:01.003Z | |" + - "| b | 1970-01-01T00:00:01.028Z | 20 |" + - "| b | 1970-01-01T00:00:01.053Z | 21 |" + - "| b | 1970-01-01T00:00:01.078Z | |" + - "| b | 1970-01-01T00:00:01.103Z | |" + - +----+--------------------------+----+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_multi_group_with_nulls() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ for output_batch_size in [1, 2, 4, 8, 16, 32] { + for input_batch_size in [1, 2, 4, 8] { + let records = TestRecords { + group_cols: vec![vec![ + Some("a"), + Some("a"), + Some("a"), + Some("a"), + Some("b"), + Some("b"), + Some("b"), + ]], + time_col: vec![ + None, + None, + Some(1_000), + Some(1_100), + None, + Some(1_000), + Some(1_100), + ], + timezone: None, + agg_cols: vec![vec![ + Some(1), + None, + Some(10), + Some(11), + Some(2), + Some(20), + Some(21), + ]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&records, 25, Some(975), 1_125); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | | 1 |" + - "| a | | |" + - "| a | 1970-01-01T00:00:00.975Z | |" + - "| a | 1970-01-01T00:00:01Z | 10 |" + - "| a | 1970-01-01T00:00:01.025Z | |" + - "| a | 1970-01-01T00:00:01.050Z | |" + - "| a | 1970-01-01T00:00:01.075Z | |" + - "| a | 1970-01-01T00:00:01.100Z | 11 |" + - "| a | 1970-01-01T00:00:01.125Z | |" + - "| b | | 2 |" + - "| b | 1970-01-01T00:00:00.975Z | |" + - "| b | 1970-01-01T00:00:01Z | 20 |" + - "| b | 1970-01-01T00:00:01.025Z | |" + - "| b | 1970-01-01T00:00:01.050Z | |" + - "| b | 1970-01-01T00:00:01.075Z | |" + - "| b | 1970-01-01T00:00:01.100Z | 21 |" + - "| b | 1970-01-01T00:00:01.125Z | |" + - +----+--------------------------+----+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_multi_group_cols_with_nulls() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ for output_batch_size in [1, 2, 4, 8, 16, 32] { + for input_batch_size in [1, 2, 4, 8] { + let records = TestRecords { + group_cols: vec![ + vec![ + Some("a"), + Some("a"), + Some("a"), + Some("a"), + Some("a"), + Some("a"), + Some("a"), + ], + vec![ + Some("c"), + Some("c"), + Some("c"), + Some("c"), + Some("d"), + Some("d"), + Some("d"), + ], + ], + time_col: vec![ + None, + None, + Some(1_000), + Some(1_100), + None, + Some(1_000), + Some(1_100), + ], + timezone: None, + agg_cols: vec![vec![ + Some(1), + None, + Some(10), + Some(11), + Some(2), + Some(20), + Some(21), + ]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&records, 25, Some(975), 1_125); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+----+--------------------------+----+ + - "| g0 | g1 | time | a0 |" + - +----+----+--------------------------+----+ + - "| a | c | | 1 |" + - "| a | c | | |" + - "| a | c | 1970-01-01T00:00:00.975Z | |" + - "| a | c | 1970-01-01T00:00:01Z | 10 |" + - "| a | c | 1970-01-01T00:00:01.025Z | |" + - "| a | c | 1970-01-01T00:00:01.050Z | |" + - "| a | c | 1970-01-01T00:00:01.075Z | |" + - "| a | c | 1970-01-01T00:00:01.100Z | 11 |" + - "| a | c | 1970-01-01T00:00:01.125Z | |" + - "| a | d | | 2 |" + - "| a | d | 1970-01-01T00:00:00.975Z | |" + - "| a | d | 1970-01-01T00:00:01Z | 20 |" + - "| a | d | 1970-01-01T00:00:01.025Z | |" + - "| a | d | 1970-01-01T00:00:01.050Z | |" + - "| a | d | 1970-01-01T00:00:01.075Z | |" + - "| a | d | 1970-01-01T00:00:01.100Z | 21 |" + - "| a | d | 1970-01-01T00:00:01.125Z | |" + - +----+----+--------------------------+----+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_multi_group_cols_with_more_nulls() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ for output_batch_size in [1, 2, 4, 8, 16, 32] { + for input_batch_size in [1, 2, 4, 8] { + let records = TestRecords { + group_cols: vec![vec![Some("a"), Some("b"), Some("b"), Some("b"), Some("b")]], + time_col: vec![ + Some(1_000), + None, // group b + None, + None, + None, + ], + timezone: None, + agg_cols: vec![vec![ + Some(10), // group a + Some(90), // group b + Some(91), + Some(92), + Some(93), + ]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&records, 25, Some(975), 1_025); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | 1970-01-01T00:00:00.975Z | |" + - "| a | 1970-01-01T00:00:01Z | 10 |" + - "| a | 1970-01-01T00:00:01.025Z | |" + - "| b | | 90 |" + - "| b | | 91 |" + - "| b | | 92 |" + - "| b | | 93 |" + - "| b | 1970-01-01T00:00:00.975Z | |" + - "| b | 1970-01-01T00:00:01Z | |" + - "| b | 1970-01-01T00:00:01.025Z | |" + - +----+--------------------------+----+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_multi_aggr_cols_with_nulls() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ for output_batch_size in [1, 2, 4, 8, 16, 32] { + for input_batch_size in [1, 2, 4, 8] { + let records = TestRecords { + group_cols: vec![ + vec![ + Some("a"), + Some("a"), + Some("a"), + Some("a"), + Some("b"), + Some("b"), + Some("b"), + ], + vec![ + Some("c"), + Some("c"), + Some("c"), + Some("c"), + Some("d"), + Some("d"), + Some("d"), + ], + ], + time_col: vec![ + None, + None, + Some(1_000), + Some(1_100), + None, + Some(1_000), + Some(1_100), + ], + timezone: None, + agg_cols: vec![ + vec![ + Some(1), + None, + Some(10), + Some(11), + Some(2), + Some(20), + Some(21), + ], + vec![ + Some(3), + Some(3), + Some(30), + None, + Some(4), + Some(40), + Some(41), + ], + ], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&records, 25, Some(975), 1_125); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+----+--------------------------+----+----+ + - "| g0 | g1 | time | a0 | a1 |" + - +----+----+--------------------------+----+----+ + - "| a | c | | 1 | 3 |" + - "| a | c | | | 3 |" + - "| a | c | 1970-01-01T00:00:00.975Z | | |" + - "| a | c | 1970-01-01T00:00:01Z | 10 | 30 |" + - "| a | c | 1970-01-01T00:00:01.025Z | | |" + - "| a | c | 1970-01-01T00:00:01.050Z | | |" + - "| a | c | 1970-01-01T00:00:01.075Z | | |" + - "| a | c | 1970-01-01T00:00:01.100Z | 11 | |" + - "| a | c | 1970-01-01T00:00:01.125Z | | |" + - "| b | d | | 2 | 4 |" + - "| b | d | 1970-01-01T00:00:00.975Z | | |" + - "| b | d | 1970-01-01T00:00:01Z | 20 | 40 |" + - "| b | d | 1970-01-01T00:00:01.025Z | | |" + - "| b | d | 1970-01-01T00:00:01.050Z | | |" + - "| b | d | 1970-01-01T00:00:01.075Z | | |" + - "| b | d | 1970-01-01T00:00:01.100Z | 21 | 41 |" + - "| b | d | 1970-01-01T00:00:01.125Z | | |" + - +----+----+--------------------------+----+----+ + "###); + assert_batch_count(&batches, 
output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_simple_no_lower_bound() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! { for output_batch_size in [1, 2, 4, 8] { + for input_batch_size in [1, 2, 4] { + let batch = TestRecords { + group_cols: vec![vec![Some("a"), Some("a"), Some("b"), Some("b")]], + time_col: vec![Some(1_025), Some(1_100), Some(1_050), Some(1_100)], + timezone: None, + agg_cols: vec![vec![Some(10), Some(11), Some(20), Some(21)]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&batch, 25, None, 1_125); + let tc = TestCase { + test_records: batch, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | 1970-01-01T00:00:01.025Z | 10 |" + - "| a | 1970-01-01T00:00:01.050Z | |" + - "| a | 1970-01-01T00:00:01.075Z | |" + - "| a | 1970-01-01T00:00:01.100Z | 11 |" + - "| a | 1970-01-01T00:00:01.125Z | |" + - "| b | 1970-01-01T00:00:01.050Z | 20 |" + - "| b | 1970-01-01T00:00:01.075Z | |" + - "| b | 1970-01-01T00:00:01.100Z | 21 |" + - "| b | 1970-01-01T00:00:01.125Z | |" + - +----+--------------------------+----+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_fill_prev() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ for output_batch_size in [1, 2, 4, 8] { + for input_batch_size in [1, 2, 4] { + let records = TestRecords { + group_cols: vec![vec![ + Some("a"), + Some("a"), + Some("b"), + Some("b"), + Some("b"), + ]], + time_col: vec![ + // 975 + Some(1000), + // 1025 + // 1050 + Some(1075), + // 1100 + // 1125 + // --- new series + // 975 + Some(1000), + // 1025 + Some(1050), + // 1075 + Some(1100), + // 1125 + ], + timezone: None, + agg_cols: vec![vec![ + Some(10), + Some(11), + Some(20), + None, + Some(21), + ]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms_with_fill_strategy(&records, 25, Some(975), 1_125, FillStrategy::PrevNullAsIntentional); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::with_settings!({ + description => format!("input_batch_size: {input_batch_size}, output_batch_size: {output_batch_size}"), + }, { + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | 1970-01-01T00:00:00.975Z | |" + - "| a | 1970-01-01T00:00:01Z | 10 |" + - "| a | 1970-01-01T00:00:01.025Z | 10 |" + - "| a | 1970-01-01T00:00:01.050Z | 10 |" + - "| a | 1970-01-01T00:00:01.075Z | 11 |" + - "| a | 1970-01-01T00:00:01.100Z | 11 |" + - "| a | 1970-01-01T00:00:01.125Z | 11 |" + - "| b | 1970-01-01T00:00:00.975Z | |" + - "| b | 1970-01-01T00:00:01Z | 20 |" + - "| b | 1970-01-01T00:00:01.025Z | 20 |" + - "| b | 1970-01-01T00:00:01.050Z | |" + - "| b | 1970-01-01T00:00:01.075Z | |" + - "| b | 1970-01-01T00:00:01.100Z | 21 |" + - "| b | 1970-01-01T00:00:01.125Z | 21 |" + - +----+--------------------------+----+ + "###) + }); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_fill_prev_null_as_missing() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ + for output_batch_size in [16, 1] { + for input_batch_size in [8, 1] { + let records = TestRecords { + group_cols: vec![vec![ + Some("a"), + Some("a"), + Some("b"), + Some("b"), + Some("b"), + ]], + time_col: vec![ + // 975 + Some(1000), + // 1025 + // 1050 + Some(1075), + // 1100 + // 1125 + // --- new series + // 975 + Some(1000), + // 1025 + Some(1050), + // 1075 + Some(1100), + // 1125 + ], + timezone: None, + agg_cols: vec![vec![ + Some(10), // a: 1000 + None, // a: 1075 + Some(20), // b: 1000 + None, // b: 1050 + Some(21), // b: 1100 + ]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms_with_fill_strategy(&records, 25, Some(975), 1_125, FillStrategy::PrevNullAsMissing); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::with_settings!({ + description => format!("input_batch_size: {input_batch_size}, output_batch_size: {output_batch_size}"), + }, { + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | 1970-01-01T00:00:00.975Z | |" + - "| a | 1970-01-01T00:00:01Z | 10 |" + - "| a | 1970-01-01T00:00:01.025Z | 10 |" + - "| a | 1970-01-01T00:00:01.050Z | 10 |" + - "| a | 1970-01-01T00:00:01.075Z | 10 |" + - "| a | 1970-01-01T00:00:01.100Z | 10 |" + - "| a | 1970-01-01T00:00:01.125Z | 10 |" + - "| b | 1970-01-01T00:00:00.975Z | |" + - "| b | 1970-01-01T00:00:01Z | 20 |" + - "| b | 1970-01-01T00:00:01.025Z | 20 |" + - "| b | 1970-01-01T00:00:01.050Z | 20 |" + - "| b | 1970-01-01T00:00:01.075Z | 20 |" + - "| b | 1970-01-01T00:00:01.100Z | 21 |" + - "| b | 1970-01-01T00:00:01.125Z | 21 |" + - +----+--------------------------+----+ + "###) + }); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_fill_prev_null_as_missing_many_nulls() { + 
test_helpers::maybe_start_logging(); + insta::allow_duplicates! { + for output_batch_size in [16, 1] { + for input_batch_size in [8, 1] { + let records = TestRecords { + group_cols: vec![vec![ + Some("a"), + Some("a"), + Some("a"), + Some("a"), + Some("a"), + Some("a"), + // --- new series + Some("b"), + Some("b"), + Some("b"), + Some("b"), + Some("b"), + ]], + time_col: vec![ + None, + Some(975), + Some(1000), + Some(1025), + Some(1050), + // 1075 + Some(1100), + // 1125 + // --- new series + None, + Some(975), + // 1000 + Some(1025), + Some(1050), + // 1075 + Some(1100), + // 1125 + ], + timezone: None, + agg_cols: vec![vec![ + Some(-1), // a: null ts + Some(10), // a: 975 + None, // a: 1000 + None, // a: 1025 (stashed) + None, // a: 1050 (stashed) + // a: 1075 (stashed) + Some(12), // a: 1100 + // a: 1125 + // --- new series + Some(-2), // b: null ts + None, // b: 975 + // b: 1000 + Some(21), // b: 1025 + None, // b: 1050 + // b: 1075 + Some(22), // b: 1100 + // b: 1125 + ]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms_with_fill_strategy(&records, 25, Some(975), 1_125, FillStrategy::PrevNullAsMissing); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::with_settings!({ + description => format!("input_batch_size: {input_batch_size}, output_batch_size: {output_batch_size}"), + }, { + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | | -1 |" + - "| a | 1970-01-01T00:00:00.975Z | 10 |" + - "| a | 1970-01-01T00:00:01Z | 10 |" + - "| a | 1970-01-01T00:00:01.025Z | 10 |" + - "| a | 1970-01-01T00:00:01.050Z | 10 |" + - "| a | 1970-01-01T00:00:01.075Z | 10 |" + - "| a | 1970-01-01T00:00:01.100Z | 12 |" + - "| a | 1970-01-01T00:00:01.125Z | 12 |" + - "| b | | -2 |" + - "| b | 1970-01-01T00:00:00.975Z | 
|" + - "| b | 1970-01-01T00:00:01Z | |" + - "| b | 1970-01-01T00:00:01.025Z | 21 |" + - "| b | 1970-01-01T00:00:01.050Z | 21 |" + - "| b | 1970-01-01T00:00:01.075Z | 21 |" + - "| b | 1970-01-01T00:00:01.100Z | 22 |" + - "| b | 1970-01-01T00:00:01.125Z | 22 |" + - +----+--------------------------+----+ + "###) + }); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +/// Show that: +/// - we can have multiple interpolated segments within +/// a series +/// - a null value will break interpolation +/// - times before the first or after the last non-null data point +/// in a series are filled with nulls. +#[test] +fn test_gapfill_fill_interpolate() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! { + for output_batch_size in [16, 1] { + let input_batch_size = 8; + let records = TestRecords { + group_cols: vec![vec![ + Some("a"), + Some("a"), + Some("a"), + // --- new series + Some("b"), + Some("b"), + Some("b"), + Some("b"), + Some("b"), + Some("b"), + ]], + time_col: vec![ + None, + // 975 + Some(1000), + // 1025 + // 1050 + Some(1075), + // 1100 + // 1125 + // --- new series + None, + Some(975), + Some(1000), + Some(1025), + // 1050 + Some(1075), + // 1100 + Some(1125), + ], + timezone: None, + agg_cols: vec![vec![ + Some(-1), + // null, 975 + Some(100), // 1000 + // 200 1025 + // 300 1050 + Some(400), // 1075 + // 1100 + // 1125 + // --- new series + Some(-10), + Some(1100), // 975 + None, // 1200 1000 (this null value will be filled) + Some(1300), // 1025 + // 1325 1050 + Some(1350), // 1075 + Some(1550), // 1100 + // 1125 + ]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms_with_fill_strategy( + &records, + 25, + Some(975), + 1_125, + FillStrategy::LinearInterpolate + ); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::with_settings!({ + description => format!("input_batch_size: 
{input_batch_size}, output_batch_size: {output_batch_size}"), + }, { + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+------+ + - "| g0 | time | a0 |" + - +----+--------------------------+------+ + - "| a | | -1 |" + - "| a | 1970-01-01T00:00:00.975Z | |" + - "| a | 1970-01-01T00:00:01Z | 100 |" + - "| a | 1970-01-01T00:00:01.025Z | 200 |" + - "| a | 1970-01-01T00:00:01.050Z | 300 |" + - "| a | 1970-01-01T00:00:01.075Z | 400 |" + - "| a | 1970-01-01T00:00:01.100Z | |" + - "| a | 1970-01-01T00:00:01.125Z | |" + - "| b | | -10 |" + - "| b | 1970-01-01T00:00:00.975Z | 1100 |" + - "| b | 1970-01-01T00:00:01Z | 1200 |" + - "| b | 1970-01-01T00:00:01.025Z | 1300 |" + - "| b | 1970-01-01T00:00:01.050Z | 1325 |" + - "| b | 1970-01-01T00:00:01.075Z | 1350 |" + - "| b | 1970-01-01T00:00:01.100Z | 1450 |" + - "| b | 1970-01-01T00:00:01.125Z | 1550 |" + - +----+--------------------------+------+ + "###) + }); + assert_batch_count(&batches, output_batch_size); + } + } +} + +#[test] +fn test_gapfill_simple_no_lower_bound_with_nulls() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ for output_batch_size in [1, 2, 4, 8] { + for input_batch_size in [1, 2, 4] { + let batch = TestRecords { + group_cols: vec![vec![ + Some("a"), + Some("a"), + Some("a"), + Some("b"), + Some("b"), + Some("b"), + Some("b"), + Some("c"), + Some("c"), + Some("c"), + Some("c"), + Some("c"), + ]], + time_col: vec![ + None, // group a + Some(1_025), + Some(1_100), + None, // group b + None, + None, + None, // group c + None, + None, + None, + Some(1_050), + Some(1_100), + ], + timezone: None, + agg_cols: vec![vec![ + Some(1), // group a + Some(10), + Some(11), + Some(90), // group b + Some(91), + Some(92), + Some(93), + None, // group c + None, + Some(2), + Some(20), + Some(21), + ]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&batch, 25, None, 1_125); + let tc = TestCase { + test_records: batch, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+----+ + - "| g0 | time | a0 |" + - +----+--------------------------+----+ + - "| a | | 1 |" + - "| a | 1970-01-01T00:00:01.025Z | 10 |" + - "| a | 1970-01-01T00:00:01.050Z | |" + - "| a | 1970-01-01T00:00:01.075Z | |" + - "| a | 1970-01-01T00:00:01.100Z | 11 |" + - "| a | 1970-01-01T00:00:01.125Z | |" + - "| b | | 90 |" + - "| b | | 91 |" + - "| b | | 92 |" + - "| b | | 93 |" + - "| c | | |" + - "| c | | |" + - "| c | | 2 |" + - "| c | 1970-01-01T00:00:01.050Z | 20 |" + - "| c | 1970-01-01T00:00:01.075Z | |" + - "| c | 1970-01-01T00:00:01.100Z | 21 |" + - "| c | 1970-01-01T00:00:01.125Z | |" + - +----+--------------------------+----+ + "###); + assert_batch_count(&batches, output_batch_size); + } + }} +} + +#[test] +fn test_gapfill_oom() { + // Show that a graceful error is produced if memory limit is exceeded + test_helpers::maybe_start_logging(); + let input_batch_size = 128; + let output_batch_size = 128; + let batch = TestRecords { + 
group_cols: vec![vec![Some("a"), Some("a")]], + time_col: vec![Some(1_000), Some(1_100)], + timezone: None, + agg_cols: vec![vec![Some(10), Some(11)]], + struct_cols: vec![], + input_batch_size, + }; + let params = get_params_ms(&batch, 25, Some(975), 1_125); + let tc = TestCase { + test_records: batch, + output_batch_size, + params, + }; + let result = tc.run_with_memory_limit(1); + assert_error!(result, DataFusionError::ResourcesExhausted(_)); +} + +#[test] +fn test_gapfill_interpolate_struct() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! { + for output_batch_size in [16, 1] { + let input_batch_size = 8; + let records = TestRecords { + group_cols: vec![vec![ + Some("a"), + Some("a"), + Some("a"), + // --- new series + Some("b"), + Some("b"), + Some("b"), + Some("b"), + Some("b"), + Some("b"), + ]], + time_col: vec![ + None, + // 975 + Some(1000), + // 1025 + // 1050 + Some(1075), + // 1100 + // 1125 + // --- new series + None, + Some(975), + Some(1000), + Some(1025), + // 1050 + Some(1075), + // 1100 + Some(1125), + ], + timezone: None, + agg_cols: vec![], + struct_cols: vec![vec![ + Some(vec![-1, 0]), + // null, 975 + Some(vec![100, 0]), + // 200 1025 + // 300 1050 + Some(vec![400, 0]), // 1075 + // 1100 + // 1125 + // --- new series + Some(vec![-10, 0]), + Some(vec![1100, 0]), // 975 + None, // 1200 1000 (this null value will be filled) + Some(vec![1300, 0]), // 1025 + // 1325 1050 + Some(vec![1350, 0]), // 1075 + Some(vec![1550, 0]), // 1100 + // 1125 + ]], + input_batch_size, + }; + let params = get_params_ms_with_fill_strategy( + &records, + 25, + Some(975), + 1_125, + FillStrategy::LinearInterpolate + ); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::with_settings!({ + description => format!("input_batch_size: {input_batch_size}, output_batch_size: {output_batch_size}"), + }, { + 
insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+------------------------+ + - "| g0 | time | a0 |" + - +----+--------------------------+------------------------+ + - "| a | | {value: -1, time: 0} |" + - "| a | 1970-01-01T00:00:00.975Z | {value: , time: } |" + - "| a | 1970-01-01T00:00:01Z | {value: 100, time: 0} |" + - "| a | 1970-01-01T00:00:01.025Z | {value: 200, time: } |" + - "| a | 1970-01-01T00:00:01.050Z | {value: 300, time: } |" + - "| a | 1970-01-01T00:00:01.075Z | {value: 400, time: 0} |" + - "| a | 1970-01-01T00:00:01.100Z | {value: , time: } |" + - "| a | 1970-01-01T00:00:01.125Z | {value: , time: } |" + - "| b | | {value: -10, time: 0} |" + - "| b | 1970-01-01T00:00:00.975Z | {value: 1100, time: 0} |" + - "| b | 1970-01-01T00:00:01Z | {value: 1200, time: } |" + - "| b | 1970-01-01T00:00:01.025Z | {value: 1300, time: 0} |" + - "| b | 1970-01-01T00:00:01.050Z | {value: 1325, time: } |" + - "| b | 1970-01-01T00:00:01.075Z | {value: 1350, time: 0} |" + - "| b | 1970-01-01T00:00:01.100Z | {value: 1450, time: } |" + - "| b | 1970-01-01T00:00:01.125Z | {value: 1550, time: 0} |" + - +----+--------------------------+------------------------+ + "###) + }); + assert_batch_count(&batches, output_batch_size); + } + } +} + +#[test] +fn test_gapfill_interpolate_struct_additional_data() { + test_helpers::maybe_start_logging(); + insta::allow_duplicates! 
{ + for output_batch_size in [16, 1] { + let input_batch_size = 8; + let records = TestRecords { + group_cols: vec![vec![ + Some("a"), + Some("a"), + Some("a"), + // --- new series + Some("b"), + Some("b"), + Some("b"), + Some("b"), + Some("b"), + Some("b"), + ]], + time_col: vec![ + None, + // 975 + Some(1000), + // 1025 + // 1050 + Some(1075), + // 1100 + // 1125 + // --- new series + None, + Some(975), + Some(1000), + Some(1025), + // 1050 + Some(1075), + // 1100 + Some(1125), + ], + timezone: None, + agg_cols: vec![], + struct_cols: vec![vec![ + Some(vec![-1, 0, 1, 1]), + // null, 975 + Some(vec![100, 0, 2, 2]), + // 200 1025 + // 300 1050 + Some(vec![400, 0, 3, 3]), // 1075 + // 1100 + // 1125 + // --- new series + Some(vec![-10, 0, 10, 10]), + Some(vec![1100, 0, 11, 11]), // 975 + None, // 1200 1000 (this null value will be filled) + Some(vec![1300, 0, 12, 12]), // 1025 + // 1325 1050 + Some(vec![1350, 0, 13, 13]), // 1075 + Some(vec![1550, 0, 14, 14]), // 1100 + // 1125 + ]], + input_batch_size, + }; + let params = get_params_ms_with_fill_strategy( + &records, + 25, + Some(975), + 1_125, + FillStrategy::LinearInterpolate + ); + let tc = TestCase { + test_records: records, + output_batch_size, + params, + }; + let batches = tc.run().unwrap(); + let actual = batches_to_lines(&batches); + insta::with_settings!({ + description => format!("input_batch_size: {input_batch_size}, output_batch_size: {output_batch_size}"), + }, { + insta::assert_yaml_snapshot!(actual, @r###" + --- + - +----+--------------------------+--------------------------------------------------+ + - "| g0 | time | a0 |" + - +----+--------------------------+--------------------------------------------------+ + - "| a | | {value: -1, time: 0, other_0: 1, other_1: 1} |" + - "| a | 1970-01-01T00:00:00.975Z | {value: , time: , other_0: , other_1: } |" + - "| a | 1970-01-01T00:00:01Z | {value: 100, time: 0, other_0: 2, other_1: 2} |" + - "| a | 1970-01-01T00:00:01.025Z | {value: 200, time: , other_0: 
, other_1: } |" + - "| a | 1970-01-01T00:00:01.050Z | {value: 300, time: , other_0: , other_1: } |" + - "| a | 1970-01-01T00:00:01.075Z | {value: 400, time: 0, other_0: 3, other_1: 3} |" + - "| a | 1970-01-01T00:00:01.100Z | {value: , time: , other_0: , other_1: } |" + - "| a | 1970-01-01T00:00:01.125Z | {value: , time: , other_0: , other_1: } |" + - "| b | | {value: -10, time: 0, other_0: 10, other_1: 10} |" + - "| b | 1970-01-01T00:00:00.975Z | {value: 1100, time: 0, other_0: 11, other_1: 11} |" + - "| b | 1970-01-01T00:00:01Z | {value: 1200, time: , other_0: , other_1: } |" + - "| b | 1970-01-01T00:00:01.025Z | {value: 1300, time: 0, other_0: 12, other_1: 12} |" + - "| b | 1970-01-01T00:00:01.050Z | {value: 1325, time: , other_0: , other_1: } |" + - "| b | 1970-01-01T00:00:01.075Z | {value: 1350, time: 0, other_0: 13, other_1: 13} |" + - "| b | 1970-01-01T00:00:01.100Z | {value: 1450, time: , other_0: , other_1: } |" + - "| b | 1970-01-01T00:00:01.125Z | {value: 1550, time: 0, other_0: 14, other_1: 14} |" + - +----+--------------------------+--------------------------------------------------+ + "###) + }); + assert_batch_count(&batches, output_batch_size); + } + } +} + +fn assert_batch_count(actual_batches: &[RecordBatch], batch_size: usize) { + let num_rows = actual_batches.iter().map(|b| b.num_rows()).sum::(); + let expected_batch_count = f64::ceil(num_rows as f64 / batch_size as f64) as usize; + assert_eq!(expected_batch_count, actual_batches.len()); +} + +type ExprVec = Vec>; + +pub(super) struct TestRecords { + pub group_cols: Vec>>, + // Stored as millisecods since intervals use millis, + // to let test cases be consistent and easier to read. + pub time_col: Vec>, + pub timezone: Option>, + pub agg_cols: Vec>>, + pub struct_cols: Vec>>>, + pub input_batch_size: usize, +} + +impl TestRecords { + fn schema(&self) -> SchemaRef { + // In order to test input with null timestamps, we need the + // timestamp column to be nullable. 
Unforunately this means + // we can't use the IOx schema builder here. + let mut fields = vec![]; + for i in 0..self.group_cols.len() { + fields.push(Field::new( + format!("g{i}"), + (&InfluxColumnType::Tag).into(), + true, + )); + } + fields.push(Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, self.timezone.clone()), + true, + )); + for i in 0..self.agg_cols.len() { + fields.push(Field::new( + format!("a{i}"), + (&InfluxColumnType::Field(InfluxFieldType::Integer)).into(), + true, + )); + } + for i in 0..self.struct_cols.len() { + fields.push(Field::new( + format!("a{}", self.agg_cols.len() + i), + DataType::Struct(self.struct_fields(i)), + true, + )); + } + Schema::new(fields).into() + } + + fn struct_fields(&self, col: usize) -> Fields { + let mut fields = vec![ + Field::new( + "value", + (&InfluxColumnType::Field(InfluxFieldType::Integer)).into(), + true, + ), + Field::new( + "time", + (&InfluxColumnType::Field(InfluxFieldType::Integer)).into(), + true, + ), + ]; + let num_other = self.struct_cols[col] + .iter() + .find(|o| o.is_some()) + .map_or(0, |v| match v.as_ref().unwrap().len() { + 0..=2 => 0, + n => n - 2, + }); + for i in 0..num_other { + fields.push(Field::new( + format!("other_{}", i), + (&InfluxColumnType::Field(InfluxFieldType::Integer)).into(), + true, + )); + } + fields.into() + } + + fn len(&self) -> usize { + self.time_col.len() + } + + fn exprs(&self) -> Result<(ExprVec, ExprVec)> { + let mut group_expr: ExprVec = vec![]; + let mut aggr_expr: ExprVec = vec![]; + let ngroup_cols = self.group_cols.len(); + for i in 0..self.schema().fields().len() { + match i.cmp(&ngroup_cols) { + Ordering::Less => group_expr.push(Arc::new(Column::new(&format!("g{i}"), i))), + Ordering::Equal => group_expr.push(Arc::new(Column::new("t", i))), + Ordering::Greater => { + let idx = i - ngroup_cols + 1; + aggr_expr.push(Arc::new(Column::new(&format!("a{idx}"), i))); + } + } + } + Ok((group_expr, aggr_expr)) + } +} + +impl TryFrom for Vec { + type 
Error = DataFusionError; + + fn try_from(value: TestRecords) -> Result { + let mut arrs: Vec = Vec::with_capacity( + value.group_cols.len() + value.agg_cols.len() + value.struct_cols.len() + 1, + ); + for gc in &value.group_cols { + let arr = Arc::new(DictionaryArray::::from_iter(gc.iter().cloned())); + arrs.push(arr); + } + // Scale from milliseconds to the nanoseconds that are actually stored. + let scaled_times = value + .time_col + .iter() + .map(|o| o.map(|v| v * 1_000_000)) + .collect::() + .with_timezone_opt(value.timezone.clone()); + arrs.push(Arc::new(scaled_times)); + for ac in &value.agg_cols { + let arr = Arc::new(Int64Array::from_iter(ac)); + arrs.push(arr); + } + for i in 0..value.struct_cols.len() { + let fields = value.struct_fields(i); + let nulls = value.struct_cols[i] + .iter() + .map(|o| o.is_none()) + .collect::>(); + let mut struct_arrs: Vec = vec![]; + for j in 0..fields.len() { + let arr = Arc::new(Int64Array::from_iter( + value.struct_cols[i] + .iter() + .map(|o| o.as_ref().map(|v| v[j])), + )); + struct_arrs.push(arr); + } + arrs.push(Arc::new(StructArray::new( + fields, + struct_arrs, + Some(nulls.into()), + ))); + } + + let one_batch = RecordBatch::try_new(value.schema(), arrs) + .map_err(|err| DataFusionError::ArrowError(err, None))?; + let mut batches = vec![]; + let mut offset = 0; + while offset < one_batch.num_rows() { + let len = std::cmp::min(value.input_batch_size, one_batch.num_rows() - offset); + let batch = one_batch.slice(offset, len); + batches.push(batch); + offset += value.input_batch_size; + } + Ok(batches) + } +} + +struct TestCase { + test_records: TestRecords, + output_batch_size: usize, + params: GapFillExecParams, +} + +impl TestCase { + fn run(self) -> Result> { + block_on(async { + let session_ctx = SessionContext::new_with_config( + SessionConfig::default().with_batch_size(self.output_batch_size), + ) + .into(); + Self::execute_with_config(&session_ctx, self.plan()?).await + }) + } + + fn 
run_with_memory_limit(self, limit: usize) -> Result> { + block_on(async { + let session_ctx = SessionContext::new_with_config_rt( + SessionConfig::default().with_batch_size(self.output_batch_size), + RuntimeEnv::new(RuntimeConfig::default().with_memory_limit(limit, 1.0))?.into(), + ) + .into(); + let result = Self::execute_with_config(&session_ctx, self.plan()?).await; + + if result.is_ok() { + // Verify that the operator reports usage in a + // symmetrical way. + let pool = &session_ctx.runtime_env().memory_pool; + assert_eq!(0, pool.reserved()); + } + + result + }) + } + + fn plan(self) -> Result> { + let schema = self.test_records.schema(); + let (group_expr, aggr_expr) = self.test_records.exprs()?; + + let input_batch_size = self.test_records.input_batch_size; + + let num_records = self.test_records.len(); + let batches: Vec = self.test_records.try_into()?; + assert_batch_count(&batches, input_batch_size); + assert_eq!( + batches.iter().map(|b| b.num_rows()).sum::(), + num_records + ); + + debug!( + "input_batch_size is {input_batch_size}, output_batch_size is {}", + self.output_batch_size + ); + let input = Arc::new(MemoryExec::try_new(&[batches], schema, None)?); + let plan = Arc::new(GapFillExec::try_new( + input, + group_expr, + aggr_expr, + self.params.clone(), + )?); + Ok(plan) + } + + async fn execute_with_config( + session_ctx: &Arc, + plan: Arc, + ) -> Result> { + let task_ctx = Arc::new(TaskContext::from(session_ctx.as_ref())); + collect(plan, task_ctx).await + } +} + +fn bound_included_from_option(o: Option) -> Bound { + if let Some(v) = o { + Bound::Included(v) + } else { + Bound::Unbounded + } +} + +fn phys_fill_strategies( + records: &TestRecords, + fill_strategy: FillStrategy, +) -> Result, FillStrategy)>> { + let start = records.group_cols.len() + 1; // 1 is for time col + let end = start + records.agg_cols.len() + records.struct_cols.len(); + let mut v = Vec::with_capacity(records.agg_cols.len()); + for f in 
&records.schema().fields()[start..end] { + v.push((phys_col(f.name(), &records.schema())?, fill_strategy)); + } + Ok(v) +} + +fn get_params_ms_with_fill_strategy( + batch: &TestRecords, + stride_ms: i64, + start: Option, + end: i64, + fill_strategy: FillStrategy, +) -> GapFillExecParams { + get_params_ms_with_origin_fill_strategy(batch, stride_ms, start, end, None, fill_strategy) +} + +fn get_params_ms_with_origin_fill_strategy( + batch: &TestRecords, + stride_ms: i64, + start: Option, + end: i64, + origin_ms: Option, + fill_strategy: FillStrategy, +) -> GapFillExecParams { + // stride is in ms + let stride = ScalarValue::new_interval_mdn(0, 0, stride_ms * 1_000_000); + let origin = + origin_ms.map(|o| phys_lit(ScalarValue::TimestampNanosecond(Some(o * 1_000_000), None))); + + GapFillExecParams { + stride: phys_lit(stride), + time_column: Column::new("t", batch.group_cols.len()), + origin, + // timestamps are nanos, so scale them accordingly + time_range: Range { + start: bound_included_from_option(start.map(|start| { + phys_lit(ScalarValue::TimestampNanosecond( + Some(start * 1_000_000), + None, + )) + })), + end: Bound::Included(phys_lit(ScalarValue::TimestampNanosecond( + Some(end * 1_000_000), + None, + ))), + }, + fill_strategy: phys_fill_strategies(batch, fill_strategy).unwrap(), + } +} + +fn get_params_ms( + batch: &TestRecords, + stride: i64, + start: Option, + end: i64, +) -> GapFillExecParams { + get_params_ms_with_fill_strategy(batch, stride, start, end, FillStrategy::Null) +} diff --git a/iox_query/src/exec/gapfill/mod.rs b/iox_query/src/exec/gapfill/mod.rs new file mode 100644 index 0000000..30ef8a5 --- /dev/null +++ b/iox_query/src/exec/gapfill/mod.rs @@ -0,0 +1,823 @@ +//! This module contains code that implements +//! 
a gap-filling extension to DataFusion + +mod algo; +mod buffered_input; +#[cfg(test)] +mod exec_tests; +mod params; +mod stream; + +use std::{ + fmt::{self, Debug}, + ops::{Bound, Range}, + sync::Arc, +}; + +use arrow::{compute::SortOptions, datatypes::SchemaRef}; +use datafusion::{ + common::DFSchemaRef, + error::{DataFusionError, Result}, + execution::{context::TaskContext, memory_pool::MemoryConsumer}, + logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}, + physical_expr::{ + create_physical_expr, execution_props::ExecutionProps, PhysicalSortExpr, + PhysicalSortRequirement, + }, + physical_plan::{ + expressions::Column, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet}, + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, + SendableRecordBatchStream, Statistics, + }, + prelude::Expr, +}; + +use self::stream::GapFillStream; + +/// A logical node that represents the gap filling operation. +#[derive(Clone, Debug, Hash, PartialEq, Eq)] +pub struct GapFill { + /// The incoming logical plan + pub input: Arc, + /// Grouping expressions + pub group_expr: Vec, + /// Aggregate expressions + pub aggr_expr: Vec, + /// Parameters to configure the behavior of the + /// gap-filling operation + pub params: GapFillParams, +} + +/// Parameters to the GapFill operation +#[derive(Clone, Debug, Hash, PartialEq, Eq)] +pub struct GapFillParams { + /// The stride argument from the call to DATE_BIN_GAPFILL + pub stride: Expr, + /// The source time column + pub time_column: Expr, + /// The origin argument from the call to DATE_BIN_GAPFILL + pub origin: Option, + /// The time range of the time column inferred from predicates + /// in the overall query. The lower bound may be [`Bound::Unbounded`] + /// which implies that gap-filling should just start from the + /// first point in each series. + pub time_range: Range>, + /// What to do when filling aggregate columns. 
+ /// The first item in the tuple will be the column + /// reference for the aggregate column. + pub fill_strategy: Vec<(Expr, FillStrategy)>, +} + +/// Describes how to fill gaps in an aggregate column. +#[derive(Clone, Debug, Hash, PartialEq, Eq, Copy)] +pub enum FillStrategy { + /// Fill with null values. + /// This is the InfluxQL behavior for `FILL(NULL)` or `FILL(NONE)`. + Null, + /// Fill with the most recent value in the input column. + /// Null values in the input are preserved. + #[allow(dead_code)] + PrevNullAsIntentional, + /// Fill with the most recent non-null value in the input column. + /// This is the InfluxQL behavior for `FILL(PREVIOUS)`. + PrevNullAsMissing, + /// Fill the gaps between points linearly. + /// Null values will not be considered as missing, so two non-null values + /// with a null in between will not be filled. + LinearInterpolate, +} + +impl GapFillParams { + // Extract the expressions so they can be optimized. + fn expressions(&self) -> Vec { + let mut exprs = vec![self.stride.clone(), self.time_column.clone()]; + if let Some(e) = self.origin.as_ref() { + exprs.push(e.clone()) + } + if let Some(start) = bound_extract(&self.time_range.start) { + exprs.push(start.clone()); + } + exprs.push( + bound_extract(&self.time_range.end) + .unwrap_or_else(|| panic!("upper time bound is required")) + .clone(), + ); + exprs + } + + #[allow(clippy::wrong_self_convention)] // follows convention of UserDefinedLogicalNode + fn from_template(&self, exprs: &[Expr], aggr_expr: &[Expr]) -> Self { + assert!( + exprs.len() >= 3, + "should be a at least stride, source and origin in params" + ); + let mut iter = exprs.iter().cloned(); + let stride = iter.next().unwrap(); + let time_column = iter.next().unwrap(); + let origin = self.origin.as_ref().map(|_| iter.next().unwrap()); + let time_range = try_map_range(&self.time_range, |b| { + try_map_bound(b.as_ref(), |_| { + Ok(iter.next().expect("expr count should match template")) + }) + }) + .unwrap(); + + 
let fill_strategy = aggr_expr + .iter() + .cloned() + .zip( + self.fill_strategy + .iter() + .map(|(_expr, fill_strategy)| fill_strategy) + .cloned(), + ) + .collect(); + + Self { + stride, + time_column, + origin, + time_range, + fill_strategy, + } + } + + // Find the expression that matches `e` and replace its fill strategy. + // If such an expression is found, return the old strategy, and `None` otherwise. + fn replace_fill_strategy(&mut self, e: &Expr, mut fs: FillStrategy) -> Option { + for expr_fs in &mut self.fill_strategy { + if &expr_fs.0 == e { + std::mem::swap(&mut fs, &mut expr_fs.1); + return Some(fs); + } + } + None + } +} + +impl GapFill { + /// Create a new gap-filling operator. + pub fn try_new( + input: Arc, + group_expr: Vec, + aggr_expr: Vec, + params: GapFillParams, + ) -> Result { + if params.time_range.end == Bound::Unbounded { + return Err(DataFusionError::Internal( + "missing upper bound in GapFill time range".to_string(), + )); + } + Ok(Self { + input, + group_expr, + aggr_expr, + params, + }) + } + + // Find the expression that matches `e` and replace its fill strategy. + // If such an expression is found, return the old strategy, and `None` otherwise. 
+ pub(crate) fn replace_fill_strategy( + &mut self, + e: &Expr, + fs: FillStrategy, + ) -> Option { + self.params.replace_fill_strategy(e, fs) + } +} + +impl UserDefinedLogicalNodeCore for GapFill { + fn name(&self) -> &str { + "GapFill" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![self.input.as_ref()] + } + + fn schema(&self) -> &DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + self.group_expr + .iter() + .chain(&self.aggr_expr) + .chain(&self.params.expressions()) + .cloned() + .collect() + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let aggr_expr: String = self + .params + .fill_strategy + .iter() + .map(|(e, fs)| match fs { + FillStrategy::PrevNullAsIntentional => format!("LOCF(null-as-intentional, {})", e), + FillStrategy::PrevNullAsMissing => format!("LOCF({})", e), + FillStrategy::LinearInterpolate => format!("INTERPOLATE({})", e), + FillStrategy::Null => e.to_string(), + }) + .collect::>() + .join(", "); + + let group_expr = self + .group_expr + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + write!( + f, + "{}: groupBy=[{group_expr}], aggr=[[{aggr_expr}]], time_column={}, stride={}, range={:?}", + self.name(), + self.params.time_column, + self.params.stride, + self.params.time_range, + ) + } + + fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + let mut group_expr: Vec<_> = exprs.to_vec(); + let mut aggr_expr = group_expr.split_off(self.group_expr.len()); + let param_expr = aggr_expr.split_off(self.aggr_expr.len()); + let params = self.params.from_template(¶m_expr, &aggr_expr); + Self::try_new(Arc::new(inputs[0].clone()), group_expr, aggr_expr, params) + .expect("should not fail") + } +} + +/// Called by the extension planner to plan a [GapFill] node. 
+pub(crate) fn plan_gap_fill( + execution_props: &ExecutionProps, + gap_fill: &GapFill, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], +) -> Result { + if logical_inputs.len() != 1 { + return Err(DataFusionError::Internal( + "GapFillExec: wrong number of logical inputs".to_string(), + )); + } + if physical_inputs.len() != 1 { + return Err(DataFusionError::Internal( + "GapFillExec: wrong number of physical inputs".to_string(), + )); + } + + let input_dfschema = logical_inputs[0].schema().as_ref(); + let input_schema = physical_inputs[0].schema(); + let input_schema = input_schema.as_ref(); + + let group_expr: Result> = gap_fill + .group_expr + .iter() + .map(|e| create_physical_expr(e, input_dfschema, input_schema, execution_props)) + .collect(); + let group_expr = group_expr?; + + let aggr_expr: Result> = gap_fill + .aggr_expr + .iter() + .map(|e| create_physical_expr(e, input_dfschema, input_schema, execution_props)) + .collect(); + let aggr_expr = aggr_expr?; + + let logical_time_column = gap_fill.params.time_column.try_into_col()?; + let time_column = Column::new_with_schema(&logical_time_column.name, input_schema)?; + + let stride = create_physical_expr( + &gap_fill.params.stride, + input_dfschema, + input_schema, + execution_props, + )?; + + let time_range = &gap_fill.params.time_range; + let time_range = try_map_range(time_range, |b| { + try_map_bound(b.as_ref(), |e| { + create_physical_expr(e, input_dfschema, input_schema, execution_props) + }) + })?; + + let origin = gap_fill + .params + .origin + .as_ref() + .map(|e| create_physical_expr(e, input_dfschema, input_schema, execution_props)) + .transpose()?; + + let fill_strategy = gap_fill + .params + .fill_strategy + .iter() + .map(|(e, fs)| { + Ok(( + create_physical_expr(e, input_dfschema, input_schema, execution_props)?, + *fs, + )) + }) + .collect::, FillStrategy)>>>()?; + + let params = GapFillExecParams { + stride, + time_column, + origin, + time_range, + fill_strategy, + }; + 
GapFillExec::try_new( + Arc::clone(&physical_inputs[0]), + group_expr, + aggr_expr, + params, + ) +} + +fn try_map_range(tr: &Range, mut f: F) -> Result> +where + F: FnMut(&T) -> Result, +{ + Ok(Range { + start: f(&tr.start)?, + end: f(&tr.end)?, + }) +} + +fn try_map_bound(bt: Bound, mut f: F) -> Result> +where + F: FnMut(T) -> Result, +{ + Ok(match bt { + Bound::Excluded(t) => Bound::Excluded(f(t)?), + Bound::Included(t) => Bound::Included(f(t)?), + Bound::Unbounded => Bound::Unbounded, + }) +} + +fn bound_extract(b: &Bound) -> Option<&T> { + match b { + Bound::Included(t) | Bound::Excluded(t) => Some(t), + Bound::Unbounded => None, + } +} + +/// A physical node for the gap-fill operation. +pub struct GapFillExec { + input: Arc, + // The group by expressions from the original aggregation node. + group_expr: Vec>, + // The aggregate expressions from the original aggregation node. + aggr_expr: Vec>, + // The sort expressions for the required sort order of the input: + // all of the group exressions, with the time column being last. + sort_expr: Vec, + // Parameters (besides streaming data) to gap filling + params: GapFillExecParams, + /// Metrics reporting behavior during execution. + metrics: ExecutionPlanMetricsSet, +} + +#[derive(Clone, Debug)] +struct GapFillExecParams { + /// The uniform interval of incoming timestamps + stride: Arc, + /// The timestamp column produced by date_bin + time_column: Column, + /// The origin argument from the all to DATE_BIN_GAPFILL + origin: Option>, + /// The time range of source input to DATE_BIN_GAPFILL. + /// Inferred from predicates in the overall query. + time_range: Range>>, + /// What to do when filling aggregate columns. + /// The 0th element in each tuple is the aggregate column. 
+ fill_strategy: Vec<(Arc, FillStrategy)>, +} + +impl GapFillExec { + fn try_new( + input: Arc, + group_expr: Vec>, + aggr_expr: Vec>, + params: GapFillExecParams, + ) -> Result { + let sort_expr = { + let mut sort_expr: Vec<_> = group_expr + .iter() + .map(|expr| PhysicalSortExpr { + expr: Arc::clone(expr), + options: SortOptions::default(), + }) + .collect(); + + // Ensure that the time column is the last component in the sort + // expressions. + let time_idx = group_expr + .iter() + .enumerate() + .find(|(_i, e)| { + e.as_any() + .downcast_ref::() + .map_or(false, |c| c.index() == params.time_column.index()) + }) + .map(|(i, _)| i); + + if let Some(time_idx) = time_idx { + let last_elem = sort_expr.len() - 1; + sort_expr.swap(time_idx, last_elem); + } else { + return Err(DataFusionError::Internal( + "could not find time column for GapFillExec".to_string(), + )); + } + + sort_expr + }; + + Ok(Self { + input, + group_expr, + aggr_expr, + sort_expr, + params, + metrics: ExecutionPlanMetricsSet::new(), + }) + } +} + +impl Debug for GapFillExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "GapFillExec") + } +} + +impl ExecutionPlan for GapFillExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn required_input_distribution(&self) -> Vec { + // It seems like it could be possible to partition on all the + // group keys except for the time expression. For now, keep it simple. 
+ vec![Distribution::SinglePartition] + } + + fn output_ordering(&self) -> Option<&[datafusion::physical_expr::PhysicalSortExpr]> { + self.input.output_ordering() + } + + fn required_input_ordering(&self) -> Vec>> { + vec![Some(PhysicalSortRequirement::from_sort_exprs( + &self.sort_expr, + ))] + } + + fn maintains_input_order(&self) -> Vec { + vec![true] + } + + fn children(&self) -> Vec> { + vec![Arc::clone(&self.input)] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + match children.len() { + 1 => Ok(Arc::new(Self::try_new( + Arc::clone(&children[0]), + self.group_expr.clone(), + self.aggr_expr.clone(), + self.params.clone(), + )?)), + _ => Err(DataFusionError::Internal( + "GapFillExec wrong number of children".to_string(), + )), + } + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + if partition != 0 { + return Err(DataFusionError::Internal(format!( + "GapFillExec invalid partition {partition}, there can be only one partition" + ))); + } + + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + let output_batch_size = context.session_config().batch_size(); + let reservation = MemoryConsumer::new(format!("GapFillExec[{partition}]")) + .register(context.memory_pool()); + let input_stream = self.input.execute(partition, context)?; + Ok(Box::pin(GapFillStream::try_new( + self, + output_batch_size, + input_stream, + reservation, + baseline_metrics, + )?)) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) + } +} + +impl DisplayAs for GapFillExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + let group_expr: Vec<_> = self.group_expr.iter().map(|e| e.to_string()).collect(); + let aggr_expr: Vec<_> = self + .params + .fill_strategy + .iter() + .map(|(e, fs)| match fs { + FillStrategy::PrevNullAsIntentional => { + 
format!("LOCF(null-as-intentional, {})", e) + } + FillStrategy::PrevNullAsMissing => format!("LOCF({})", e), + FillStrategy::LinearInterpolate => format!("INTERPOLATE({})", e), + FillStrategy::Null => e.to_string(), + }) + .collect(); + let time_range = try_map_range(&self.params.time_range, |b| { + try_map_bound(b.as_ref(), |e| Ok(e.to_string())) + }) + .map_err(|_| fmt::Error {})?; + write!( + f, + "GapFillExec: group_expr=[{}], aggr_expr=[{}], stride={}, time_range={:?}", + group_expr.join(", "), + aggr_expr.join(", "), + self.params.stride, + time_range + ) + } + } + } +} + +#[cfg(test)] +mod test { + use std::ops::{Bound, Range}; + + use crate::{ + exec::{Executor, ExecutorType}, + test::{format_execution_plan, format_logical_plan}, + }; + + use super::*; + use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; + use datafusion::{ + datasource::empty::EmptyTable, + error::Result, + logical_expr::{logical_plan, Extension, UserDefinedLogicalNode}, + prelude::{col, lit}, + scalar::ScalarValue, + }; + use datafusion_util::lit_timestamptz_nano; + + use test_helpers::assert_error; + + fn schema() -> Schema { + Schema::new(vec![ + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new("loc", DataType::Utf8, false), + Field::new("temp", DataType::Float64, false), + ]) + } + + fn table_scan() -> Result { + let schema = schema(); + logical_plan::table_scan(Some("temps"), &schema, None)?.build() + } + + fn fill_strategy_null(cols: Vec) -> Vec<(Expr, FillStrategy)> { + cols.into_iter().map(|e| (e, FillStrategy::Null)).collect() + } + + #[test] + fn test_try_new_errs() { + let scan = table_scan().unwrap(); + let result = GapFill::try_new( + Arc::new(scan), + vec![col("loc"), col("time")], + vec![col("temp")], + GapFillParams { + stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), + time_column: col("time"), + origin: None, + time_range: Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: 
Bound::Unbounded, + }, + fill_strategy: fill_strategy_null(vec![col("temp")]), + }, + ); + + assert_error!(result, DataFusionError::Internal(ref msg) if msg == "missing upper bound in GapFill time range"); + } + + fn assert_gapfill_from_template_roundtrip(gapfill: &GapFill) { + let gapfill_as_node: &dyn UserDefinedLogicalNode = gapfill; + let scan = table_scan().unwrap(); + let exprs = gapfill_as_node.expressions(); + let want_exprs = gapfill.group_expr.len() + + gapfill.aggr_expr.len() + + 2 // stride, time + + gapfill.params.origin.iter().count() + + bound_extract(&gapfill.params.time_range.start).iter().count() + + bound_extract(&gapfill.params.time_range.end).iter().count(); + assert_eq!(want_exprs, exprs.len()); + let gapfill_ft = gapfill_as_node.from_template(&exprs, &[scan]); + let gapfill_ft = gapfill_ft + .as_any() + .downcast_ref::() + .expect("should be a GapFill"); + assert_eq!(gapfill.group_expr, gapfill_ft.group_expr); + assert_eq!(gapfill.aggr_expr, gapfill_ft.aggr_expr); + assert_eq!(gapfill.params, gapfill_ft.params); + } + + #[test] + fn test_from_template() { + for params in vec![ + // no origin, no start bound + GapFillParams { + stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), + time_column: col("time"), + origin: None, + time_range: Range { + start: Bound::Unbounded, + end: Bound::Excluded(lit_timestamptz_nano(2000)), + }, + fill_strategy: fill_strategy_null(vec![col("temp")]), + }, + // no origin, yes start bound + GapFillParams { + stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), + time_column: col("time"), + origin: None, + time_range: Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), + }, + fill_strategy: fill_strategy_null(vec![col("temp")]), + }, + // yes origin, no start bound + GapFillParams { + stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), + time_column: col("time"), + origin: Some(lit_timestamptz_nano(1_000_000_000)), + time_range: Range { + 
start: Bound::Unbounded, + end: Bound::Excluded(lit_timestamptz_nano(2000)), + }, + fill_strategy: fill_strategy_null(vec![col("temp")]), + }, + // yes origin, yes start bound + GapFillParams { + stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), + time_column: col("time"), + origin: Some(lit_timestamptz_nano(1_000_000_000)), + time_range: Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), + }, + fill_strategy: fill_strategy_null(vec![col("temp")]), + }, + ] { + let scan = table_scan().unwrap(); + let gapfill = GapFill::try_new( + Arc::new(scan.clone()), + vec![col("loc"), col("time")], + vec![col("temp")], + params, + ) + .unwrap(); + assert_gapfill_from_template_roundtrip(&gapfill); + } + } + + #[test] + fn fmt_logical_plan() -> Result<()> { + // This test case does not make much sense but + // just verifies we can construct a logical gapfill node + // and show its plan. + let scan = table_scan()?; + let gapfill = GapFill::try_new( + Arc::new(scan), + vec![col("loc"), col("time")], + vec![col("temp")], + GapFillParams { + stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), + time_column: col("time"), + origin: None, + time_range: Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), + }, + fill_strategy: fill_strategy_null(vec![col("temp")]), + }, + )?; + let plan = LogicalPlan::Extension(Extension { + node: Arc::new(gapfill), + }); + + insta::assert_yaml_snapshot!( + format_logical_plan(&plan), + @r###" + --- + - " GapFill: groupBy=[loc, time], aggr=[[temp]], time_column=time, stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " TableScan: temps" + "### + ); + Ok(()) + } + + async fn format_explain(sql: &str) -> Result> { + let executor = Executor::new_testing(); + let context = executor.new_context(ExecutorType::Query); + context + 
.inner() + .register_table("temps", Arc::new(EmptyTable::new(Arc::new(schema()))))?; + let physical_plan = context.sql_to_physical_plan(sql).await?; + Ok(format_execution_plan(&physical_plan)) + } + + #[tokio::test] + async fn plan_gap_fill() -> Result<()> { + // show that the optimizer rule can fire and that physical + // planning will succeed. + let sql = "SELECT date_bin_gapfill(interval '1 minute', time, timestamp '1970-01-01T00:00:00Z') AS minute, avg(temp)\ + \nFROM temps\ + \nWHERE time >= '1980-01-01T00:00:00Z' and time < '1981-01-01T00:00:00Z'\ + \nGROUP BY minute;"; + + let explain = format_explain(sql).await?; + insta::assert_yaml_snapshot!( + explain, + @r###" + --- + - " ProjectionExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as minute, AVG(temps.temp)@1 as AVG(temps.temp)]" + - " GapFillExec: group_expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0], aggr_expr=[AVG(temps.temp)@1], stride=60000000000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" + - " SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 ASC]" + - " AggregateExec: mode=Final, gby=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]" + - " AggregateExec: mode=Partial, gby=[date_bin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]" + - " EmptyExec" + "### + ); + Ok(()) + } + + #[tokio::test] + async fn gap_fill_exec_sort_order() -> Result<()> { + // The call to `date_bin_gapfill` should be last in the SortExec + // expressions, even though it was not last on the SELECT list + // or the GROUP BY clause. 
+ let sql = "SELECT \ + \n loc,\ + \n date_bin_gapfill(interval '1 minute', time, timestamp '1970-01-01T00:00:00Z') AS minute,\ + \n concat('zz', loc) AS loczz,\ + \n avg(temp)\ + \nFROM temps\ + \nWHERE time >= '1980-01-01T00:00:00Z' and time < '1981-01-01T00:00:00Z' + \nGROUP BY loc, minute, loczz;"; + + let explain = format_explain(sql).await?; + insta::assert_yaml_snapshot!( + explain, + @r###" + --- + - " ProjectionExec: expr=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as minute, concat(Utf8(\"zz\"),temps.loc)@2 as loczz, AVG(temps.temp)@3 as AVG(temps.temp)]" + - " GapFillExec: group_expr=[loc@0, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1, concat(Utf8(\"zz\"),temps.loc)@2], aggr_expr=[AVG(temps.temp)@3], stride=60000000000, time_range=Included(\"315532800000000000\")..Excluded(\"347155200000000000\")" + - " SortExec: expr=[loc@0 ASC,concat(Utf8(\"zz\"),temps.loc)@2 ASC,date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 ASC]" + - " AggregateExec: mode=Final, gby=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(Utf8(\"zz\"),temps.loc)@2 as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]" + - " AggregateExec: mode=Partial, gby=[loc@1 as loc, date_bin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(zz, loc@1) as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]" + - " EmptyExec" + "### + ); + Ok(()) + } +} diff --git a/iox_query/src/exec/gapfill/params.rs b/iox_query/src/exec/gapfill/params.rs new file mode 100644 index 0000000..5e9d0c4 --- /dev/null +++ b/iox_query/src/exec/gapfill/params.rs @@ -0,0 +1,392 @@ +//! 
Evaluate the parameters to be used for gap filling. +use std::ops::Bound; + +use arrow::{ + datatypes::{IntervalMonthDayNanoType, SchemaRef}, + record_batch::RecordBatch, +}; +use chrono::Duration; +use datafusion::{ + error::{DataFusionError, Result}, + physical_expr::datetime_expressions::date_bin, + physical_plan::{expressions::Column, ColumnarValue}, + scalar::ScalarValue, +}; +use hashbrown::HashMap; + +use super::{try_map_bound, try_map_range, FillStrategy, GapFillExecParams}; + +/// The parameters to gap filling. Included here are the parameters +/// that remain constant during gap filling, i.e., not the streaming table +/// data, or anything else. +/// When we support `locf` for aggregate columns, that will be tracked here. +#[derive(Clone, Debug, PartialEq)] +pub(crate) struct GapFillParams { + /// The stride in nanoseconds of the timestamps to be output. + pub stride: i64, + /// The first timestamp (inclusive) to be output for each series, + /// in nanoseconds since the epoch. `None` means gap filling should + /// start from the first timestamp in each series. + pub first_ts: Option, + /// The last timestamp (inclusive!) to be output for each series, + /// in nanoseconds since the epoch. + pub last_ts: i64, + /// What to do when filling gaps in aggregate columns. + /// The map is keyed on the columns offset in the schema. + pub fill_strategy: HashMap, +} + +impl GapFillParams { + /// Create a new [GapFillParams] by figuring out the actual values (as native i64) for the stride, + /// first and last timestamp for gap filling. 
+ pub(super) fn try_new(schema: SchemaRef, params: &GapFillExecParams) -> Result { + let batch = RecordBatch::new_empty(schema); + let stride = params.stride.evaluate(&batch)?; + let origin = params + .origin + .as_ref() + .map(|e| e.evaluate(&batch)) + .transpose()?; + + // Evaluate the upper and lower bounds of the time range + let range = try_map_range(¶ms.time_range, |b| { + try_map_bound(b.as_ref(), |pe| { + extract_timestamp_nanos(&pe.evaluate(&batch)?) + }) + })?; + + // Find the smallest timestamp that might appear in the + // range. There might not be one, which is okay. + let first_ts = match range.start { + Bound::Included(v) => Some(v), + Bound::Excluded(v) => Some(v + 1), + Bound::Unbounded => None, + }; + + // Find the largest timestamp that might appear in the + // range + let last_ts = match range.end { + Bound::Included(v) => v, + Bound::Excluded(v) => v - 1, + Bound::Unbounded => { + return Err(DataFusionError::Execution( + "missing upper time bound for gap filling".to_string(), + )) + } + }; + + // Call date_bin on the timestamps to find the first and last time bins + // for each series + let mut args = vec![stride, i64_to_columnar_ts(first_ts)]; + if let Some(v) = origin { + args.push(v) + } + let first_ts = first_ts + .map(|_| extract_timestamp_nanos(&date_bin(&args)?)) + .transpose()?; + args[1] = i64_to_columnar_ts(Some(last_ts)); + let last_ts = extract_timestamp_nanos(&date_bin(&args)?)?; + + let fill_strategy = params + .fill_strategy + .iter() + .map(|(e, fs)| { + let idx = e + .as_any() + .downcast_ref::() + .ok_or(DataFusionError::Internal(format!( + "fill strategy aggr expr was not a column: {e:?}", + )))? + .index(); + Ok((idx, *fs)) + }) + .collect::>>()?; + + Ok(Self { + stride: extract_interval_nanos(&args[0])?, + first_ts, + last_ts, + fill_strategy, + }) + } + + /// Returns the number of rows remaining for a series that starts with first_ts. 
+ pub fn valid_row_count(&self, first_ts: i64) -> usize { + if self.last_ts >= first_ts { + ((self.last_ts - first_ts) / self.stride + 1) as usize + } else { + 0 + } + } +} + +fn i64_to_columnar_ts(i: Option) -> ColumnarValue { + match i { + Some(i) => ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(i), None)), + None => ColumnarValue::Scalar(ScalarValue::Null), + } +} + +fn extract_timestamp_nanos(cv: &ColumnarValue) -> Result { + Ok(match cv { + ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(v), _)) => *v, + _ => { + return Err(DataFusionError::Execution( + "gap filling argument must be a scalar timestamp".to_string(), + )) + } + }) +} + +fn extract_interval_nanos(cv: &ColumnarValue) -> Result { + match cv { + ColumnarValue::Scalar(ScalarValue::IntervalMonthDayNano(Some(v))) => { + let (months, days, nanos) = IntervalMonthDayNanoType::to_parts(*v); + + if months != 0 { + return Err(DataFusionError::Execution( + "gap filling does not support month intervals".to_string(), + )); + } + + let nanos = + (Duration::days(days as i64) + Duration::nanoseconds(nanos)).num_nanoseconds(); + nanos.ok_or_else(|| { + DataFusionError::Execution("gap filling argument is too large".to_string()) + }) + } + _ => Err(DataFusionError::Execution( + "gap filling expects a stride parameter to be a scalar interval".to_string(), + )), + } +} + +#[cfg(test)] +mod tests { + use std::{ + ops::{Bound, Range}, + sync::Arc, + }; + + use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; + use datafusion::{ + datasource::empty::EmptyTable, + error::Result, + physical_plan::{ + expressions::{Column, Literal}, + PhysicalExpr, + }, + scalar::ScalarValue, + }; + use hashbrown::HashMap; + + use crate::exec::{ + gapfill::{FillStrategy, GapFillExec, GapFillExecParams}, + Executor, ExecutorType, + }; + + use super::GapFillParams; + + #[tokio::test] + async fn test_evaluate_params() -> Result<()> { + test_helpers::maybe_start_logging(); + let actual = 
plan_statement_and_get_params( + "select\ + \n date_bin_gapfill(interval '1 minute', time) minute\ + \nfrom t\ + \nwhere time >= timestamp '1984-01-01T16:00:00Z' - interval '5 minutes'\ + \n and time <= timestamp '1984-01-01T16:00:00Z'\ + \ngroup by minute", + ) + .await?; + let expected = GapFillParams { + stride: 60_000_000_000, // 1 minute + first_ts: Some(441_820_500_000_000_000), // Sunday, January 1, 1984 3:55:00 PM + last_ts: 441_820_800_000_000_000, // Sunday, January 1, 1984 3:59:00 PM + fill_strategy: HashMap::new(), + }; + assert_eq!(expected, actual); + Ok(()) + } + + #[tokio::test] + async fn test_evaluate_params_default_origin() -> Result<()> { + // as above but the default origin is explicitly specified. + test_helpers::maybe_start_logging(); + let actual = plan_statement_and_get_params( + "select\ + \n date_bin_gapfill(interval '1 minute', time, timestamp '1970-01-01T00:00:00Z') minute\ + \nfrom t\ + \nwhere time >= timestamp '1984-01-01T16:00:00Z' - interval '5 minutes'\ + \n and time <= timestamp '1984-01-01T16:00:00Z'\ + \ngroup by minute", + ).await?; + let expected = GapFillParams { + stride: 60_000_000_000, // 1 minute + first_ts: Some(441_820_500_000_000_000), // Sunday, January 1, 1984 3:55:00 PM + last_ts: 441_820_800_000_000_000, // Sunday, January 1, 1984 3:59:00 PM + fill_strategy: HashMap::new(), + }; + assert_eq!(expected, actual); + Ok(()) + } + + #[tokio::test] + async fn test_evaluate_params_exclude_end() -> Result<()> { + test_helpers::maybe_start_logging(); + let actual = plan_statement_and_get_params( + "select\ + \n date_bin_gapfill(interval '1 minute', time) minute\ + \nfrom t\ + \nwhere time >= timestamp '1984-01-01T16:00:00Z' - interval '5 minutes'\ + \n and time < timestamp '1984-01-01T16:00:00Z'\ + \ngroup by minute", + ) + .await?; + let expected = GapFillParams { + stride: 60_000_000_000, // 1 minute + first_ts: Some(441_820_500_000_000_000), // Sunday, January 1, 1984 3:55:00 PM + // Last bin at 16:00 is excluded +
last_ts: 441_820_740_000_000_000, // Sunday, January 1, 1984 3:59:00 PM + fill_strategy: HashMap::new(), + }; + assert_eq!(expected, actual); + Ok(()) + } + + #[tokio::test] + async fn test_evaluate_params_exclude_start() -> Result<()> { + test_helpers::maybe_start_logging(); + let actual = plan_statement_and_get_params( + "select\ + \n date_bin_gapfill(interval '1 minute', time) minute\ + \nfrom t\ + \nwhere time > timestamp '1984-01-01T16:00:00Z' - interval '5 minutes'\ + \n and time <= timestamp '1984-01-01T16:00:00Z'\ + \ngroup by minute", + ) + .await?; + let expected = GapFillParams { + stride: 60_000_000_000, // 1 minute + // First bin not excluded since it truncates to 15:55:00 + first_ts: Some(441_820_500_000_000_000), // Sunday, January 1, 1984 3:55:00 PM + last_ts: 441_820_800_000_000_000, // Sunday, January 1, 1984 3:59:00 PM + fill_strategy: HashMap::new(), + }; + assert_eq!(expected, actual); + Ok(()) + } + + #[tokio::test] + async fn test_evaluate_params_origin() -> Result<()> { + test_helpers::maybe_start_logging(); + let actual = plan_statement_and_get_params( + // origin is 9s after the epoch + "select\ + \n date_bin_gapfill(interval '1 minute', time, timestamp '1970-01-01T00:00:09Z') minute\ + \nfrom t\ + \nwhere time >= timestamp '1984-01-01T16:00:00Z' - interval '5 minutes'\ + \n and time <= timestamp '1984-01-01T16:00:00Z'\ + \ngroup by minute", + ).await?; + let expected = GapFillParams { + stride: 60_000_000_000, // 1 minute + first_ts: Some(441_820_449_000_000_000), // Sunday, January 1, 1984 3:54:09 PM + last_ts: 441_820_749_000_000_000, // Sunday, January 1, 1984 3:59:09 PM + fill_strategy: HashMap::new(), + }; + assert_eq!(expected, actual); + Ok(()) + } + + fn interval(ns: i64) -> Arc { + Arc::new(Literal::new(ScalarValue::new_interval_mdn(0, 0, ns))) + } + + fn timestamp(ns: i64) -> Arc { + Arc::new(Literal::new(ScalarValue::TimestampNanosecond( + Some(ns), + None, + ))) + } + + #[test] + fn test_params_no_start() { + let exec_params =
GapFillExecParams { + stride: interval(1_000_000_000), + time_column: Column::new("time", 0), + origin: None, + time_range: Range { + start: Bound::Unbounded, + end: Bound::Excluded(timestamp(20_000_000_000)), + }, + fill_strategy: std::iter::once(( + Arc::new(Column::new("a0", 1)) as Arc, + FillStrategy::Null, + )) + .collect(), + }; + + let actual = GapFillParams::try_new(schema().into(), &exec_params).unwrap(); + assert_eq!( + GapFillParams { + stride: 1_000_000_000, + first_ts: None, + last_ts: 19_000_000_000, + fill_strategy: simple_fill_strategy(), + }, + actual + ); + } + + #[test] + #[allow(clippy::reversed_empty_ranges)] + fn test_params_row_count() -> Result<()> { + test_helpers::maybe_start_logging(); + let params = GapFillParams { + stride: 10, + first_ts: Some(1000), + last_ts: 1050, + fill_strategy: simple_fill_strategy(), + }; + + assert_eq!(6, params.valid_row_count(1000)); + assert_eq!(0, params.valid_row_count(1100)); + Ok(()) + } + + fn schema() -> Schema { + Schema::new(vec![ + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new( + "other_time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new("loc", DataType::Utf8, false), + Field::new("temp", DataType::Float64, false), + ]) + } + + async fn plan_statement_and_get_params(sql: &str) -> Result { + let executor = Executor::new_testing(); + let context = executor.new_context(ExecutorType::Query); + context + .inner() + .register_table("t", Arc::new(EmptyTable::new(Arc::new(schema()))))?; + let physical_plan = context.sql_to_physical_plan(sql).await?; + let gapfill_node = &physical_plan.children()[0]; + let gapfill_node = gapfill_node.as_any().downcast_ref::().unwrap(); + let exec_params = &gapfill_node.params; + let schema = schema(); + GapFillParams::try_new(schema.into(), exec_params) + } + + fn simple_fill_strategy() -> HashMap { + std::iter::once((1, FillStrategy::Null)).collect() + } +} diff --git 
a/iox_query/src/exec/gapfill/stream.rs b/iox_query/src/exec/gapfill/stream.rs new file mode 100644 index 0000000..499de06 --- /dev/null +++ b/iox_query/src/exec/gapfill/stream.rs @@ -0,0 +1,284 @@ +//! Implementation of [Stream] that performs gap-filling on tables. +use std::{ + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use arrow::{ + array::{ArrayRef, TimestampNanosecondArray}, + datatypes::SchemaRef, + record_batch::RecordBatch, +}; + +use arrow_util::optimize::optimize_dictionaries; +use datafusion::{ + error::{DataFusionError, Result}, + execution::memory_pool::MemoryReservation, + physical_plan::{ + expressions::Column, + metrics::{BaselineMetrics, RecordOutput}, + ExecutionPlan, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, + }, +}; +use futures::{ready, Stream, StreamExt}; + +use super::{algo::GapFiller, buffered_input::BufferedInput, params::GapFillParams, GapFillExec}; + +/// An implementation of a gap-filling operator that uses the [Stream] trait. +/// +/// This type takes responsibility for: +/// - Reading input record batches +/// - Accounting for memory +/// - Extracting arrays for processing by [`GapFiller`] +/// - Recording metrics +/// - Sending record batches to next operator (by implementing [`Self::poll_next`]) +#[allow(dead_code)] +pub(super) struct GapFillStream { + /// The schema of the input and output. + schema: SchemaRef, + /// The column from the input that contains the timestamps for each row. + /// This column has already had `date_bin` applied to it by a previous `Aggregate` + /// operator. + time_expr: Arc, + /// The other columns from the input that appeared in the GROUP BY clause of the + /// original query. + group_expr: Vec>, + /// The aggregate columns from the select list of the original query. + aggr_expr: Vec>, + /// The producer of the input record batches. + input: SendableRecordBatchStream, + /// Input that has been read from the input stream.
+ buffered_input: BufferedInput, + /// The thing that does the gap filling. + gap_filler: GapFiller, + /// This is true as long as there are more input record batches to read from `input`. + more_input: bool, + /// For tracking memory. + reservation: MemoryReservation, + /// Baseline metrics. + baseline_metrics: BaselineMetrics, +} + +impl GapFillStream { + /// Creates a new GapFillStream. + pub fn try_new( + exec: &GapFillExec, + batch_size: usize, + input: SendableRecordBatchStream, + reservation: MemoryReservation, + metrics: BaselineMetrics, + ) -> Result { + let schema = exec.schema(); + let GapFillExec { + sort_expr, + aggr_expr, + params, + .. + } = exec; + + if sort_expr.is_empty() { + return Err(DataFusionError::Internal( + "empty sort_expr vector for gap filling; should have at least a time expression" + .to_string(), + )); + } + let mut group_expr = sort_expr + .iter() + .map(|se| Arc::clone(&se.expr)) + .collect::>(); + let aggr_expr = aggr_expr.to_owned(); + let time_expr = group_expr.split_off(group_expr.len() - 1).pop().unwrap(); + + let group_cols = group_expr.iter().map(expr_to_index).collect::>(); + let params = GapFillParams::try_new(Arc::clone(&schema), params)?; + let buffered_input = BufferedInput::new(¶ms, group_cols); + + let gap_filler = GapFiller::new(params, batch_size); + Ok(Self { + schema, + time_expr, + group_expr, + aggr_expr, + input, + buffered_input, + gap_filler, + more_input: true, + reservation, + baseline_metrics: metrics, + }) + } +} + +impl RecordBatchStream for GapFillStream { + fn schema(&self) -> arrow::datatypes::SchemaRef { + Arc::clone(&self.schema) + } +} + +impl Stream for GapFillStream { + type Item = Result; + + /// Produces a gap-filled record batch from its input stream. + /// + /// For details on implementation, see [`GapFiller`]. 
+ fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + let last_output_row_offset = self.gap_filler.last_output_row_offset(); + while self.more_input && self.buffered_input.need_more(last_output_row_offset)? { + match ready!(self.input.poll_next_unpin(cx)) { + Some(Ok(batch)) => { + self.reservation.try_grow(batch.get_array_memory_size())?; + self.buffered_input.push(batch); + } + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + None => { + self.more_input = false; + } + } + } + + let input_batch = match self.take_buffered_input() { + Ok(None) => return Poll::Ready(None), + Ok(Some(input_batch)) => { + // If we have consumed all of our input, and there is no more work + if self.gap_filler.done(input_batch.num_rows()) { + // leave the input batch taken so that its reference + // count goes to zero. + self.reservation.shrink(input_batch.get_array_memory_size()); + return Poll::Ready(None); + } + + input_batch + } + Err(e) => return Poll::Ready(Some(Err(e))), + }; + + match self.process(input_batch) { + Ok((output_batch, remaining_input_batch)) => { + self.buffered_input.push(remaining_input_batch); + + self.reservation + .shrink(output_batch.get_array_memory_size()); + Poll::Ready(Some(Ok(output_batch))) + } + Err(e) => Poll::Ready(Some(Err(e))), + } + } +} + +impl GapFillStream { + /// If any buffered input batches are present, concatenates it all together + /// and returns an owned batch to the caller, leaving `self.buffered_input_batches` empty. + fn take_buffered_input(&mut self) -> Result> { + let batches = self.buffered_input.take(); + if batches.is_empty() { + return Ok(None); + } + + let old_size = batches.iter().map(|rb| rb.get_array_memory_size()).sum(); + + let mut batch = arrow::compute::concat_batches(&self.schema, &batches) + .map_err(|err| DataFusionError::ArrowError(err, None))?; + self.reservation.try_grow(batch.get_array_memory_size())?; + + if batches.len() > 1 { + // Optimize the dictionaries. 
The output of this operator uses the take kernel to produce + // its output. Since the input batches will usually be smaller than the output, it should + // be less work to optimize here vs optimizing the output. + batch = optimize_dictionaries(&batch) + .map_err(|err| DataFusionError::ArrowError(err, None))?; + } + + self.reservation.shrink(old_size); + Ok(Some(batch)) + } + + /// Produces a 2-tuple of [RecordBatch]es: + /// - The gap-filled output + /// - Remaining buffered input + fn process(&mut self, mut input_batch: RecordBatch) -> Result<(RecordBatch, RecordBatch)> { + let elapsed_compute = self.baseline_metrics.elapsed_compute().clone(); + + let input_time_array = self + .time_expr + .evaluate(&input_batch)? + .into_array(input_batch.num_rows())?; + let input_time_array: &TimestampNanosecondArray = input_time_array + .as_any() + .downcast_ref() + .ok_or(DataFusionError::Internal( + "time array must be a TimestampNanosecondArray".to_string(), + ))?; + let input_time_array = (expr_to_index(&self.time_expr), input_time_array); + + let group_arrays = self.group_arrays(&input_batch)?; + let aggr_arrays = self.aggr_arrays(&input_batch)?; + + let timer = elapsed_compute.timer(); + let output_batch = self + .gap_filler + .build_gapfilled_output( + Arc::clone(&self.schema), + input_time_array, + &group_arrays, + &aggr_arrays, + ) + .record_output(&self.baseline_metrics)?; + timer.done(); + + self.reservation + .try_grow(output_batch.get_array_memory_size())?; + + // Slice the input to just what is needed moving forward, with one context + // row before the next input offset. + input_batch = self.gap_filler.slice_input_batch(input_batch)?; + + Ok((output_batch, input_batch)) + } + + /// Produces the arrays for the group columns in the input. + /// The first item in the 2-tuple is the arrays offset in the schema. 
+ fn group_arrays(&self, input_batch: &RecordBatch) -> Result> { + self.group_expr + .iter() + .map(|e| { + Ok(( + expr_to_index(e), + e.evaluate(input_batch)? + .into_array(input_batch.num_rows())?, + )) + }) + .collect::>>() + } + + /// Produces the arrays for the aggregate columns in the input. + /// The first item in the 2-tuple is the arrays offset in the schema. + fn aggr_arrays(&self, input_batch: &RecordBatch) -> Result> { + self.aggr_expr + .iter() + .map(|e| { + Ok(( + expr_to_index(e), + e.evaluate(input_batch)? + .into_array(input_batch.num_rows())?, + )) + }) + .collect::>>() + } +} + +/// Returns the index of the given expression in the schema, +/// assuming that it is a column. +/// +/// # Panic +/// Panics if the expression is not a column. +fn expr_to_index(expr: &Arc) -> usize { + expr.as_any() + .downcast_ref::() + .expect("all exprs should be columns") + .index() +} diff --git a/iox_query/src/exec/metrics.rs b/iox_query/src/exec/metrics.rs new file mode 100644 index 0000000..7a39768 --- /dev/null +++ b/iox_query/src/exec/metrics.rs @@ -0,0 +1,52 @@ +use std::{ + borrow::Cow, + sync::{Arc, Weak}, +}; + +use datafusion::execution::memory_pool::MemoryPool; +use metric::{Attributes, Instrument, MetricKind, Observation, Reporter}; + +/// Hooks DataFusion [`MemoryPool`] into our [`metric`] crate. +#[derive(Debug, Clone)] +pub struct DataFusionMemoryPoolMetricsBridge { + pool: Weak, + limit: usize, +} + +impl DataFusionMemoryPoolMetricsBridge { + /// Register new pool. 
+ pub fn new(pool: &Arc, limit: usize) -> Self { + Self { + pool: Arc::downgrade(pool), + limit, + } + } +} + +impl Instrument for DataFusionMemoryPoolMetricsBridge { + fn report(&self, reporter: &mut dyn Reporter) { + reporter.start_metric( + "datafusion_mem_pool_bytes", + "Number of bytes within the datafusion memory pool", + MetricKind::U64Gauge, + ); + let Some(pool_arc) = self.pool.upgrade() else { + return; + }; + + reporter.report_observation( + &Attributes::from([("state", Cow::Borrowed("limit"))]), + Observation::U64Gauge(self.limit as u64), + ); + + reporter.report_observation( + &Attributes::from([("state", Cow::Borrowed("reserved"))]), + Observation::U64Gauge(pool_arc.reserved() as u64), + ); + reporter.finish_metric(); + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } +} diff --git a/iox_query/src/exec/non_null_checker.rs b/iox_query/src/exec/non_null_checker.rs new file mode 100644 index 0000000..8a60bd7 --- /dev/null +++ b/iox_query/src/exec/non_null_checker.rs @@ -0,0 +1,478 @@ +//! This module contains code for the "NonNullChecker" DataFusion +//! extension plan node +//! +//! A NonNullChecker node takes an arbitrary input array and produces +//! a single string output column that contains +//! +//! 1. A single string if any of the input columns are non-null +//! 2. zero rows if all of the input columns are null +//! +//! For this input: +//! +//! ColA | ColB | ColC +//! ------+------+------ +//! 1 | NULL | NULL +//! 2 | 2 | NULL +//! 3 | 2 | NULL +//! +//! The output would be (given 'the_value' was provided to `NonNullChecker` node) +//! +//! non_null_column +//! ----------------- +//! the_value +//! +//! However, for this input (All NULL) +//! +//! ColA | ColB | ColC +//! ------+------+------ +//! NULL | NULL | NULL +//! NULL | NULL | NULL +//! NULL | NULL | NULL +//! +//! There would be no output rows +//! +//! non_null_column +//! ----------------- +//! +//! 
This operation can be used to implement the table_name metadata query + +use std::{ + fmt::{self, Debug}, + sync::Arc, +}; + +use arrow::{ + array::{new_empty_array, StringArray}, + datatypes::{DataType, Field, Schema, SchemaRef}, + record_batch::RecordBatch, +}; +use datafusion::logical_expr::expr_vec_fmt; +use datafusion::{ + common::{DFSchemaRef, ToDFSchema}, + error::{DataFusionError, Result}, + execution::context::TaskContext, + logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}, + physical_plan::{ + expressions::PhysicalSortExpr, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, + SendableRecordBatchStream, Statistics, + }, +}; + +use datafusion_util::{watch::WatchedTask, AdapterStream}; +use observability_deps::tracing::debug; +use tokio::sync::mpsc; +use tokio_stream::StreamExt; + +/// Implements the NonNullChecker operation as described in this module's documentation +#[derive(Hash, PartialEq, Eq)] +pub struct NonNullCheckerNode { + input: LogicalPlan, + schema: DFSchemaRef, + /// these expressions represent what columns are "used" by this + /// node (in this case all of them) -- columns that are not used + /// are optimized away by datafusion.
+ exprs: Vec, + + /// The value to produce if there are any non null Inputs + value: Arc, +} + +impl NonNullCheckerNode { + /// Creates a new NonNullChecker node + /// + /// # Panics + /// If the input schema is empty + pub fn new(value: &str, input: LogicalPlan) -> Self { + let schema = make_non_null_checker_output_schema(); + + // Form exprs that refer to all of our input columns (so that + // datafusion knows not to opimize them away) + let exprs = input + .schema() + .fields() + .iter() + .map(|field| Expr::Column(field.qualified_column())) + .collect::>(); + + assert!(!exprs.is_empty(), "NonNullChecker: input schema was empty"); + + Self { + input, + schema, + exprs, + value: value.into(), + } + } + + /// Return the value associated with this checker + pub fn value(&self) -> Arc { + Arc::clone(&self.value) + } +} + +impl Debug for NonNullCheckerNode { + /// Use explain format for the Debug format. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.fmt_for_explain(f) + } +} + +impl UserDefinedLogicalNodeCore for NonNullCheckerNode { + fn name(&self) -> &str { + "NonNullChecker" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + /// Schema for Pivot is a single string + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + self.exprs.clone() + } + + /// For example: `NonNullChecker('the_value'), exprs=[foo]` + fn fmt_for_explain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}('{}') exprs={}", + self.name(), + self.value, + expr_vec_fmt!(self.exprs) + ) + } + + fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + assert_eq!(inputs.len(), 1, "NonNullChecker: input sizes inconsistent"); + assert_eq!( + exprs.len(), + self.exprs.len(), + "NonNullChecker: expression sizes inconsistent" + ); + Self::new(self.value.as_ref(), inputs[0].clone()) + } +} + +// ------ The implementation of NonNullChecker code follows ----- + +/// Create the schema 
describing the output +pub fn make_non_null_checker_output_schema() -> DFSchemaRef { + let nullable = false; + Schema::new(vec![Field::new( + "non_null_column", + DataType::Utf8, + nullable, + )]) + .to_dfschema_ref() + .unwrap() +} + +/// Physical operator that implements the NonNullChecker operation aginst +/// data types +pub struct NonNullCheckerExec { + input: Arc, + /// Output schema + schema: SchemaRef, + /// The value to produce if there are any non null Inputs + value: Arc, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, +} + +impl NonNullCheckerExec { + pub fn new(input: Arc, schema: SchemaRef, value: Arc) -> Self { + Self { + input, + schema, + value, + metrics: ExecutionPlanMetricsSet::new(), + } + } +} + +impl Debug for NonNullCheckerExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "NonNullCheckerExec") + } +} + +impl ExecutionPlan for NonNullCheckerExec { + fn as_any(&self) -> &(dyn std::any::Any + 'static) { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> Partitioning { + use Partitioning::*; + match self.input.output_partitioning() { + RoundRobinBatch(num_partitions) => RoundRobinBatch(num_partitions), + // as this node transforms the output schema, whatever partitioning + // was present on the input is lost on the output + Hash(_, num_partitions) => UnknownPartitioning(num_partitions), + UnknownPartitioning(num_partitions) => UnknownPartitioning(num_partitions), + } + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn required_input_distribution(&self) -> Vec { + vec![Distribution::UnspecifiedDistribution] + } + + fn children(&self) -> Vec> { + vec![Arc::clone(&self.input)] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + match children.len() { + 1 => Ok(Arc::new(Self { + input: Arc::clone(&children[0]), + schema: Arc::clone(&self.schema), + metrics: 
ExecutionPlanMetricsSet::new(), + value: Arc::clone(&self.value), + })), + _ => Err(DataFusionError::Internal( + "NonNullCheckerExec wrong number of children".to_string(), + )), + } + } + + /// Execute one partition and return an iterator over RecordBatch + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + debug!(partition, "Start NonNullCheckerExec::execute"); + if self.output_partitioning().partition_count() <= partition { + return Err(DataFusionError::Internal(format!( + "NonNullCheckerExec invalid partition {partition}" + ))); + } + + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + let input_stream = self.input.execute(partition, context)?; + + let (tx, rx) = mpsc::channel(1); + + let fut = check_for_nulls( + input_stream, + Arc::clone(&self.schema), + baseline_metrics, + Arc::clone(&self.value), + tx.clone(), + ); + + // A second task watches the output of the worker task and + // reports errors + let handle = WatchedTask::new(fut, vec![tx], "non_null_checker"); + + debug!(partition, "End NonNullCheckerExec::execute"); + Ok(AdapterStream::adapt(self.schema(), rx, handle)) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) + } +} + +impl DisplayAs for NonNullCheckerExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "NonNullCheckerExec") + } + } + } +} + +async fn check_for_nulls( + mut input_stream: SendableRecordBatchStream, + schema: SchemaRef, + baseline_metrics: BaselineMetrics, + value: Arc, + tx: mpsc::Sender>, +) -> Result<(), DataFusionError> { + while let Some(input_batch) = input_stream.next().await.transpose()? 
{ + let timer = baseline_metrics.elapsed_compute().timer(); + + if input_batch + .columns() + .iter() + .any(|arr| arr.null_count() != arr.len()) + { + // found a non null in input, return value + let arr: StringArray = vec![Some(value.as_ref())].into(); + + let output_batch = RecordBatch::try_new(schema, vec![Arc::new(arr)])?; + // ignore errors on sending (means receiver hung up) + std::mem::drop(timer); + tx.send(Ok(output_batch)).await.ok(); + return Ok(()); + } + // else keep looking + } + // if we got here, did not see any non null values. So + // send back an empty record batch + let output_batch = RecordBatch::try_new(schema, vec![new_empty_array(&DataType::Utf8)])?; + + // ignore errors on sending (means receiver hung up) + tx.send(Ok(output_batch)).await.ok(); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{ArrayRef, StringArray}; + use arrow_util::assert_batches_eq; + use datafusion::physical_plan::memory::MemoryExec; + use datafusion_util::test_collect; + + #[tokio::test] + async fn test_single_column_non_null() { + let t1 = StringArray::from(vec![Some("a"), Some("c"), Some("c")]); + let batch = RecordBatch::try_from_iter(vec![("t1", Arc::new(t1) as ArrayRef)]).unwrap(); + + let results = check("the_value", vec![batch]).await; + + let expected = vec![ + "+-----------------+", + "| non_null_column |", + "+-----------------+", + "| the_value |", + "+-----------------+", + ]; + assert_batches_eq!(&expected, &results); + } + + #[tokio::test] + async fn test_single_column_null() { + let t1 = StringArray::from(vec![None::<&str>, None, None]); + let batch = RecordBatch::try_from_iter(vec![("t1", Arc::new(t1) as ArrayRef)]).unwrap(); + + let results = check("the_value", vec![batch]).await; + + let expected = vec![ + "+-----------------+", + "| non_null_column |", + "+-----------------+", + "+-----------------+", + ]; + assert_batches_eq!(&expected, &results); + } + + #[tokio::test] + async fn test_multi_column_non_null() { + let t1 
= StringArray::from(vec![None::<&str>, None, None]); + let t2 = StringArray::from(vec![None::<&str>, None, Some("c")]); + let batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ]) + .unwrap(); + + let results = check("the_value", vec![batch]).await; + + let expected = vec![ + "+-----------------+", + "| non_null_column |", + "+-----------------+", + "| the_value |", + "+-----------------+", + ]; + assert_batches_eq!(&expected, &results); + } + + #[tokio::test] + async fn test_multi_column_null() { + let t1 = StringArray::from(vec![None::<&str>, None, None]); + let t2 = StringArray::from(vec![None::<&str>, None, None]); + let batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ]) + .unwrap(); + + let results = check("the_value", vec![batch]).await; + + let expected = vec![ + "+-----------------+", + "| non_null_column |", + "+-----------------+", + "+-----------------+", + ]; + assert_batches_eq!(&expected, &results); + } + + #[tokio::test] + async fn test_multi_column_second_batch_non_null() { + // this time only the second batch has a non null value + let t1 = StringArray::from(vec![None::<&str>, None, None]); + let t2 = StringArray::from(vec![None::<&str>, None, None]); + + let batch1 = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ]) + .unwrap(); + + let t1 = StringArray::from(vec![None::<&str>]); + let t2 = StringArray::from(vec![Some("f")]); + + let batch2 = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ]) + .unwrap(); + + let results = check("another_value", vec![batch1, batch2]).await; + + let expected = vec![ + "+-----------------+", + "| non_null_column |", + "+-----------------+", + "| another_value |", + "+-----------------+", + ]; + assert_batches_eq!(&expected, &results); + } + + /// Run the input through 
the checker and return results + async fn check(value: &str, input: Vec) -> Vec { + test_helpers::maybe_start_logging(); + + // Setup in memory stream + let schema = input[0].schema(); + let projection = None; + let input = Arc::new(MemoryExec::try_new(&[input], schema, projection).unwrap()); + + // Create and run the checker + let schema: Schema = make_non_null_checker_output_schema().as_ref().into(); + let exec = Arc::new(NonNullCheckerExec::new( + input, + Arc::new(schema), + value.into(), + )); + + test_collect(exec as Arc).await + } +} diff --git a/iox_query/src/exec/query_tracing.rs b/iox_query/src/exec/query_tracing.rs new file mode 100644 index 0000000..de639c3 --- /dev/null +++ b/iox_query/src/exec/query_tracing.rs @@ -0,0 +1,703 @@ +//! This module contains the code to map DataFusion metrics to `Span`s +//! for use in distributed tracing (e.g. Jaeger) + +use arrow::record_batch::RecordBatch; +use chrono::{DateTime, Utc}; +use datafusion::error::DataFusionError; +use datafusion::physical_plan::{ + metrics::{MetricValue, MetricsSet}, + DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, +}; +use futures::StreamExt; +use hashbrown::HashMap; +use observability_deps::tracing::debug; +use std::{fmt, sync::Arc}; +use trace::span::{Span, SpanRecorder}; + +const PER_PARTITION_TRACING_ENABLE_ENV: &str = "INFLUXDB_IOX_PER_PARTITION_TRACING"; +fn per_partition_tracing() -> bool { + use std::sync::atomic::{AtomicU8, Ordering}; + static TRACING_ENABLED: AtomicU8 = AtomicU8::new(u8::MAX); + + match TRACING_ENABLED.load(Ordering::Relaxed) { + u8::MAX => { + let val = std::env::var(PER_PARTITION_TRACING_ENABLE_ENV) + .ok() + .and_then(|x| x.parse::().ok()) + .map(Into::into) + .unwrap_or(false); + + TRACING_ENABLED.store(val as u8, Ordering::Relaxed); + val + } + x => x != 0, + } +} + +/// Stream wrapper that records DataFusion `MetricSets` into IOx +/// [`Span`]s when it is dropped. 
+pub(crate) struct TracedStream { + inner: SendableRecordBatchStream, + span_recorder: SpanRecorder, + physical_plan: Arc, +} + +impl TracedStream { + /// Return a stream that records DataFusion `MetricSets` from + /// `physical_plan` into `span` when dropped. + pub(crate) fn new( + inner: SendableRecordBatchStream, + span: Option, + physical_plan: Arc, + ) -> Self { + Self { + inner, + span_recorder: SpanRecorder::new(span), + physical_plan, + } + } +} + +impl RecordBatchStream for TracedStream { + fn schema(&self) -> arrow::datatypes::SchemaRef { + self.inner.schema() + } +} + +impl futures::Stream for TracedStream { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + self.inner.poll_next_unpin(cx) + } +} + +impl Drop for TracedStream { + fn drop(&mut self) { + if let Some(span) = self.span_recorder.span() { + let default_end_time = Utc::now(); + let per_partition_tracing = per_partition_tracing(); + send_metrics_to_tracing( + default_end_time, + span, + self.physical_plan.as_ref(), + per_partition_tracing, + ); + } + } +} + +/// This function translates data in DataFusion `MetricSets` into IOx +/// [`Span`]s. It records a snapshot of the current state of the +/// DataFusion metrics, so it should only be invoked *after* a plan is +/// fully `collect`ed. +/// +/// Each `ExecutionPlan` in the plan gets its own new [`Span`] that covers +/// the time spent executing its partitions and its children +/// +/// Each `ExecutionPlan` also has a new [`Span`] for each of its +/// partitions that collected metrics +/// +/// The start and end time of the span are taken from the +/// ExecutionPlan's metrics, falling back to the parent span's +/// timestamps if there are no metrics +/// +/// Span metadata is used to record: +/// 1. If the ExecutionPlan had no metrics +/// 2. The total number of rows produced by the ExecutionPlan (if available) +/// 3. 
The elapsed compute time taken by the ExecutionPlan +pub fn send_metrics_to_tracing( + default_end_time: DateTime, + parent_span: &Span, + physical_plan: &dyn ExecutionPlan, + per_partition_tracing: bool, +) { + // Something like this when one_line is contributed back upstream + //let plan_name = physical_plan.displayable().one_line().to_string(); + let desc = one_line(physical_plan).to_string(); + let operator_name: String = desc.chars().take_while(|x| *x != ':').collect(); + + // Get the timings of the parent operator + let parent_start_time = parent_span.start.unwrap_or(default_end_time); + let parent_end_time = parent_span.end.unwrap_or(default_end_time); + + // A span for the operation, this is the aggregate of all the partition spans + let mut operator_span = parent_span.child(operator_name.clone()); + operator_span.metadata.insert("desc".into(), desc.into()); + + let mut operator_metrics = SpanMetrics { + output_rows: None, + elapsed_compute_nanos: None, + }; + + // The total duration for this span and all its children and partitions + let mut operator_start_time = DateTime::::MAX_UTC; + let mut operator_end_time = DateTime::::MIN_UTC; + + match physical_plan.metrics() { + None => { + // this DataFusion node had no metrics, so record that in + // metadata and use the start/stop time of the parent span + operator_span + .metadata + .insert("missing_statistics".into(), "true".into()); + } + Some(metrics) => { + // Create a separate span for each partition in the operator + for (partition, metrics) in partition_metrics(metrics) { + let (start_ts, end_ts) = get_timestamps(&metrics); + + let partition_start_time = start_ts.unwrap_or(parent_start_time); + let partition_end_time = end_ts.unwrap_or(parent_end_time); + + let partition_metrics = SpanMetrics { + output_rows: metrics.output_rows(), + elapsed_compute_nanos: metrics.elapsed_compute(), + }; + + operator_start_time = operator_start_time.min(partition_start_time); + operator_end_time = 
operator_end_time.max(partition_end_time); + + // Update the aggregate totals in the operator span + operator_metrics.aggregate_child(&partition_metrics); + + // Generate a span for the partition if + // - these metrics correspond to a partition + // - per partition tracing is enabled + if per_partition_tracing { + if let Some(partition) = partition { + let mut partition_span = + operator_span.child(format!("{operator_name} ({partition})")); + + partition_span.start = Some(partition_start_time); + partition_span.end = Some(partition_end_time); + + partition_metrics.add_to_span(&mut partition_span); + + partition_span.export(); + } + } + } + } + } + + // If we've not encountered any metrics to determine the operator's start + // and end time, use those of the parent + if operator_start_time == DateTime::::MAX_UTC { + operator_start_time = parent_span.start.unwrap_or(default_end_time); + } + + if operator_end_time == DateTime::::MIN_UTC { + operator_end_time = parent_span.end.unwrap_or(default_end_time); + } + + operator_span.start = Some(operator_start_time); + operator_span.end = Some(operator_end_time); + + // recurse + for child in physical_plan.children() { + send_metrics_to_tracing( + operator_end_time, + &operator_span, + child.as_ref(), + per_partition_tracing, + ); + } + + operator_metrics.add_to_span(&mut operator_span); + operator_span.export(); +} + +#[derive(Debug)] +struct SpanMetrics { + output_rows: Option, + elapsed_compute_nanos: Option, +} + +impl SpanMetrics { + fn aggregate_child(&mut self, child: &Self) { + if let Some(rows) = child.output_rows { + *self.output_rows.get_or_insert(0) += rows; + } + + if let Some(nanos) = child.elapsed_compute_nanos { + *self.elapsed_compute_nanos.get_or_insert(0) += nanos; + } + } + + fn add_to_span(&self, span: &mut Span) { + if let Some(rows) = self.output_rows { + span.metadata + .insert("output_rows".into(), (rows as i64).into()); + } + + if let Some(nanos) = self.elapsed_compute_nanos { + span.metadata + 
.insert("elapsed_compute_nanos".into(), (nanos as i64).into()); + } + } +} + +fn partition_metrics(metrics: MetricsSet) -> HashMap, MetricsSet> { + let mut hashmap = HashMap::<_, MetricsSet>::new(); + for metric in metrics.iter() { + hashmap + .entry(metric.partition()) + .or_default() + .push(Arc::clone(metric)) + } + hashmap +} + +// todo contribute this back upstream to datafusion (add to `DisplayableExecutionPlan`) + +/// Return a `Display`able structure that produces a single line, for +/// this node only (does not recurse to children) +pub fn one_line(plan: &dyn ExecutionPlan) -> impl fmt::Display + '_ { + struct Wrapper<'a> { + plan: &'a dyn ExecutionPlan, + } + impl<'a> fmt::Display for Wrapper<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let t = DisplayFormatType::Default; + self.plan.fmt_as(t, f) + } + } + + Wrapper { plan } +} + +// TODO maybe also contribute these back upstream to datafusion (make +// as a method on MetricsSet) + +/// Return the start, and end timestamps of the metrics set, if any +fn get_timestamps(metrics: &MetricsSet) -> (Option>, Option>) { + let mut start_ts = None; + let mut end_ts = None; + + for metric in metrics.iter() { + if metric.labels().is_empty() { + match metric.value() { + MetricValue::StartTimestamp(ts) => { + if ts.value().is_some() && start_ts.is_some() { + debug!( + ?metric, + ?start_ts, + "WARNING: more than one StartTimestamp metric found" + ) + } + start_ts = ts.value() + } + MetricValue::EndTimestamp(ts) => { + if ts.value().is_some() && end_ts.is_some() { + debug!( + ?metric, + ?end_ts, + "WARNING: more than one EndTimestamp metric found" + ) + } + end_ts = ts.value() + } + _ => {} + } + } + } + + (start_ts, end_ts) +} + +/// Boolean flag that works with environment variables. 
/// Boolean flag that works with environment variables.
///
/// Accepts the usual spellings of yes/no (`yes`, `y`, `true`, `t`, `1`
/// and their negative counterparts), case-insensitively, via [`FromStr`],
/// and converts losslessly into `bool`.
#[derive(Debug, Clone, Copy)]
pub enum BooleanFlag {
    True,
    False,
}

impl std::str::FromStr for BooleanFlag {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_ascii_lowercase().as_str() {
            "yes" | "y" | "true" | "t" | "1" => Ok(Self::True),
            "no" | "n" | "false" | "f" | "0" => Ok(Self::False),
            _ => Err(format!(
                "Invalid boolean flag '{s}'. Valid options: yes, no, y, n, true, false, t, f, 1, 0"
            )),
        }
    }
}

impl From<BooleanFlag> for bool {
    fn from(yes_no: BooleanFlag) -> Self {
        matches!(yes_no, BooleanFlag::True)
    }
}
-------- ts4] <-- both start and end timestamps + // child1: [ ts2 - ] <-- only start timestamp + // child2: [ ts2 --- ts3] <-- both start and end timestamps + // child3: [ --- ts3] <-- only end timestamps (e.g. bad data) + // child4: [ ] <-- no timestamps + // child5 (1): [ --- ts2] + // child5 (2): [ ts2 --- ts3] + // child5 (4): [ ts1 --- ] + let mut exec = TestExec::new("exec", make_time_metric_set(Some(ts1), Some(ts4), Some(1))); + exec.new_child( + "child1: foo", + make_time_metric_set(Some(ts2), None, Some(1)), + ); + exec.new_child( + "child2: bar", + make_time_metric_set(Some(ts2), Some(ts3), None), + ); + exec.new_child( + "child3: baz", + make_time_metric_set(None, Some(ts3), Some(1)), + ); + exec.new_child("child4: bingo", make_time_metric_set(None, None, Some(1))); + exec.new_child("child5: bongo", many_partition); + + let traces = TraceBuilder::new(); + send_metrics_to_tracing(ts5, &traces.make_span(), &exec, true); + + let spans = traces.spans(); + let spans: BTreeMap<_, _> = spans.iter().map(|s| (s.name.as_ref(), s)).collect(); + + println!("Spans: \n\n{spans:#?}"); + assert_eq!(spans.len(), 10); + + let check_span = |span: &Span, expected_start, expected_end, desc: Option<&str>| { + assert_eq!(span.start, expected_start, "expected start; {span:?}"); + assert_eq!(span.end, expected_end, "expected end; {span:?}"); + assert_eq!(span.metadata.get("desc").map(|x| x.string().unwrap()), desc); + }; + + check_span( + spans["TestExec - exec"], + Some(ts1), + Some(ts4), + Some("TestExec - exec"), + ); + + check_span( + spans["TestExec - child1"], + Some(ts2), + Some(ts4), + Some("TestExec - child1: foo"), + ); + + check_span( + spans["TestExec - child2"], + Some(ts2), + Some(ts3), + Some("TestExec - child2: bar"), + ); + + check_span( + spans["TestExec - child3"], + Some(ts1), + Some(ts3), + Some("TestExec - child3: baz"), + ); + check_span(spans["TestExec - child3 (1)"], Some(ts1), Some(ts3), None); + + check_span( + spans["TestExec - child4"], + Some(ts1), 
+ Some(ts4), + Some("TestExec - child4: bingo"), + ); + + check_span( + spans["TestExec - child5"], + Some(ts1), + Some(ts4), + Some("TestExec - child5: bongo"), + ); + check_span(spans["TestExec - child5 (1)"], Some(ts1), Some(ts2), None); + check_span(spans["TestExec - child5 (2)"], Some(ts2), Some(ts3), None); + check_span(spans["TestExec - child5 (3)"], Some(ts1), Some(ts4), None); + } + + #[test] + fn no_metrics() { + // given execution plan with no metrics, should add notation on metadata + let mut exec = TestExec::new("exec", Default::default()); + exec.metrics = None; + + let traces = TraceBuilder::new(); + send_metrics_to_tracing(Utc::now(), &traces.make_span(), &exec, true); + + let spans = traces.spans(); + assert_eq!(spans.len(), 1); + assert_eq!( + spans[0].metadata.get("missing_statistics"), + Some(&MetaValue::String("true".into())), + "spans: {spans:#?}" + ); + } + + // row count and elapsed compute + #[test] + fn metrics() { + // given execution plan with execution time and compute spread across two partitions (1, and 2) + let mut exec = TestExec::new("exec", Default::default()); + add_output_rows(exec.metrics_mut(), 100, 1); + add_output_rows(exec.metrics_mut(), 200, 2); + + add_elapsed_compute(exec.metrics_mut(), 1000, 1); + add_elapsed_compute(exec.metrics_mut(), 2000, 2); + + let traces = TraceBuilder::new(); + send_metrics_to_tracing(Utc::now(), &traces.make_span(), &exec, true); + + // aggregated metrics should be reported + let spans = traces.spans(); + let spans: BTreeMap<_, _> = spans.iter().map(|s| (s.name.as_ref(), s)).collect(); + + assert_eq!(spans.len(), 3); + + let check_span = |span: &Span, output_row: i64, nanos: i64| { + assert_eq!( + span.metadata.get("output_rows"), + Some(&MetaValue::Int(output_row)), + "span: {span:#?}" + ); + + assert_eq!( + span.metadata.get("elapsed_compute_nanos"), + Some(&MetaValue::Int(nanos)), + "spans: {span:#?}" + ); + }; + + check_span(spans["TestExec - exec"], 300, 3000); + check_span(spans["TestExec 
- exec (1)"], 100, 1000); + check_span(spans["TestExec - exec (2)"], 200, 2000); + } + + fn add_output_rows(metrics: &mut MetricsSet, output_rows: usize, partition: usize) { + let value = Count::new(); + value.add(output_rows); + + let partition = Some(partition); + metrics.push(Arc::new(Metric::new( + MetricValue::OutputRows(value), + partition, + ))); + } + + fn add_elapsed_compute(metrics: &mut MetricsSet, elapsed_compute: u64, partition: usize) { + let value = Time::new(); + value.add_duration(Duration::from_nanos(elapsed_compute)); + + let partition = Some(partition); + metrics.push(Arc::new(Metric::new( + MetricValue::ElapsedCompute(value), + partition, + ))); + } + + fn make_time_metric_set( + start: Option>, + end: Option>, + partition: Option, + ) -> MetricsSet { + let mut metrics = MetricsSet::new(); + add_time_metrics(&mut metrics, start, end, partition); + metrics + } + + fn add_time_metrics( + metrics: &mut MetricsSet, + start: Option>, + end: Option>, + partition: Option, + ) { + if let Some(start) = start { + let value = make_metrics_timestamp(start); + metrics.push(Arc::new(Metric::new( + MetricValue::StartTimestamp(value), + partition, + ))); + } + + if let Some(end) = end { + let value = make_metrics_timestamp(end); + metrics.push(Arc::new(Metric::new( + MetricValue::EndTimestamp(value), + partition, + ))); + } + } + + fn make_metrics_timestamp(t: DateTime) -> Timestamp { + let timestamp = Timestamp::new(); + timestamp.set(t); + timestamp + } + + /// Encapsulates creating and capturing spans for tests + struct TraceBuilder { + collector: Arc, + } + + impl TraceBuilder { + fn new() -> Self { + Self { + collector: Arc::new(RingBufferTraceCollector::new(10)), + } + } + + // create a new span connected to the collector + fn make_span(&self) -> Span { + SpanContext::new(Arc::clone(&self.collector) as _).child("foo") + } + + /// return all collected spans + fn spans(&self) -> Vec { + self.collector.spans() + } + } + + /// mocked out execution plan we 
can control metrics + #[derive(Debug)] + struct TestExec { + name: String, + metrics: Option, + children: Vec>, + } + + impl TestExec { + fn new(name: impl Into, metrics: MetricsSet) -> Self { + Self { + name: name.into(), + metrics: Some(metrics), + children: vec![], + } + } + + fn new_child(&mut self, name: impl Into, metrics: MetricsSet) { + self.children.push(Arc::new(Self::new(name, metrics))); + } + + fn metrics_mut(&mut self) -> &mut MetricsSet { + self.metrics.as_mut().unwrap() + } + } + + impl ExecutionPlan for TestExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> arrow::datatypes::SchemaRef { + unimplemented!() + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + unimplemented!() + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + unimplemented!() + } + + fn children(&self) -> Vec> { + self.children.clone() + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> datafusion::error::Result> { + unimplemented!() + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> datafusion::error::Result + { + unimplemented!() + } + + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) + } + + fn metrics(&self) -> Option { + self.metrics.clone() + } + } + + impl DisplayAs for TestExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "TestExec - {}", self.name) + } + } + + #[test] + fn test_parsing() { + assert!(bool::from(BooleanFlag::from_str("yes").unwrap())); + assert!(bool::from(BooleanFlag::from_str("Yes").unwrap())); + assert!(bool::from(BooleanFlag::from_str("YES").unwrap())); + + assert!(!bool::from(BooleanFlag::from_str("No").unwrap())); + assert!(!bool::from(BooleanFlag::from_str("FaLse").unwrap())); + + BooleanFlag::from_str("foo").unwrap_err(); + } +} diff --git a/iox_query/src/exec/schema_pivot.rs b/iox_query/src/exec/schema_pivot.rs 
new file mode 100644 index 0000000..a3e3d3a --- /dev/null +++ b/iox_query/src/exec/schema_pivot.rs @@ -0,0 +1,561 @@ +//! This module contains code for the "SchemaPivot" DataFusion +//! extension plan node +//! +//! A SchemaPivot node takes an arbitrary input like +//! +//! ColA | ColB | ColC +//! ------+------+------ +//! 1 | NULL | NULL +//! 2 | 2 | NULL +//! 3 | 2 | NULL +//! +//! And pivots it to a table with a single string column for any +//! columns that had non null values. +//! +//! non_null_column +//! ----------------- +//! "ColA" +//! "ColB" +//! +//! This operation can be used to implement the tag_keys metadata query + +use std::{ + fmt::{self, Debug}, + sync::Arc, +}; + +use arrow::{ + array::StringArray, + datatypes::{DataType, Field, Schema, SchemaRef}, + error::ArrowError, + record_batch::RecordBatch, +}; +use datafusion::{ + common::{DFSchemaRef, ToDFSchema}, + error::{DataFusionError as Error, Result}, + execution::context::TaskContext, + logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}, + physical_plan::{ + expressions::PhysicalSortExpr, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RecordOutput}, + DisplayFormatType, Distribution, ExecutionPlan, Partitioning, SendableRecordBatchStream, + Statistics, + }, +}; +use datafusion::{error::DataFusionError, physical_plan::DisplayAs}; + +use datafusion_util::{watch::WatchedTask, AdapterStream}; +use observability_deps::tracing::debug; +use tokio::sync::mpsc; +use tokio_stream::StreamExt; + +/// Implements the SchemaPivot operation described in `make_schema_pivot` +#[derive(Hash, PartialEq, Eq)] +pub struct SchemaPivotNode { + input: LogicalPlan, + schema: DFSchemaRef, + // these expressions represent what columns are "used" by this + // node (in this case all of them) -- columns that are not used + // are optimzied away by datafusion. 
+ exprs: Vec, +} + +impl SchemaPivotNode { + pub fn new(input: LogicalPlan) -> Self { + let schema = make_schema_pivot_output_schema(); + + // Form exprs that refer to all of our input columns (so that + // datafusion knows not to opimize them away) + let exprs = input + .schema() + .fields() + .iter() + .map(|field| Expr::Column(field.qualified_column())) + .collect::>(); + + Self { + input, + schema, + exprs, + } + } +} + +impl Debug for SchemaPivotNode { + /// Use explain format for the Debug format. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.fmt_for_explain(f) + } +} + +impl UserDefinedLogicalNodeCore for SchemaPivotNode { + fn name(&self) -> &str { + "SchemaPivot" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + /// Schema for Pivot is a single string + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + self.exprs.clone() + } + + /// For example: `SchemaPivot` + fn fmt_for_explain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.name()) + } + + fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + assert_eq!(inputs.len(), 1, "SchemaPivot: input sizes inconistent"); + assert_eq!( + exprs.len(), + self.exprs.len(), + "SchemaPivot: expression sizes inconistent" + ); + Self::new(inputs[0].clone()) + } +} + +// ------ The implementation of SchemaPivot code follows ----- + +/// Create the schema describing the output +fn make_schema_pivot_output_schema() -> DFSchemaRef { + let nullable = false; + Schema::new(vec![Field::new( + "non_null_column", + DataType::Utf8, + nullable, + )]) + .to_dfschema_ref() + .unwrap() +} + +/// Physical operator that implements the SchemaPivot operation against +/// data types +pub struct SchemaPivotExec { + input: Arc, + /// Output schema + schema: SchemaRef, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, +} + +impl SchemaPivotExec { + pub fn new(input: Arc, schema: SchemaRef) -> Self { 
+ Self { + input, + schema, + metrics: ExecutionPlanMetricsSet::new(), + } + } +} + +impl Debug for SchemaPivotExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "SchemaPivotExec") + } +} + +impl ExecutionPlan for SchemaPivotExec { + fn as_any(&self) -> &(dyn std::any::Any + 'static) { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> Partitioning { + use Partitioning::*; + match self.input.output_partitioning() { + RoundRobinBatch(num_partitions) => RoundRobinBatch(num_partitions), + // as this node transforms the output schema, whatever partitioning + // was present on the input is lost on the output + Hash(_, num_partitions) => UnknownPartitioning(num_partitions), + UnknownPartitioning(num_partitions) => UnknownPartitioning(num_partitions), + } + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn required_input_distribution(&self) -> Vec { + vec![Distribution::UnspecifiedDistribution] + } + + fn children(&self) -> Vec> { + vec![Arc::clone(&self.input)] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + match children.len() { + 1 => Ok(Arc::new(Self { + input: Arc::clone(&children[0]), + schema: Arc::clone(&self.schema), + metrics: ExecutionPlanMetricsSet::new(), + })), + _ => Err(Error::Internal( + "SchemaPivotExec wrong number of children".to_string(), + )), + } + } + + /// Execute one partition and return an iterator over RecordBatch + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + debug!(partition, "Start SchemaPivotExec::execute"); + + if self.output_partitioning().partition_count() <= partition { + return Err(Error::Internal(format!( + "SchemaPivotExec invalid partition {partition}" + ))); + } + + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + let input_schema = self.input.schema(); + let input_stream = self.input.execute(partition, context)?; + + // 
the operation performed in a separate task which is + // then sent via a channel to the output + let (tx, rx) = mpsc::channel(1); + + let fut = schema_pivot( + input_stream, + input_schema, + self.schema(), + tx.clone(), + baseline_metrics, + ); + + // A second task watches the output of the worker task and reports errors + let handle = WatchedTask::new(fut, vec![tx], "schema_pivot"); + + debug!(partition, "End SchemaPivotExec::execute"); + Ok(AdapterStream::adapt(self.schema(), rx, handle)) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) + } +} + +impl DisplayAs for SchemaPivotExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "SchemaPivotExec") + } + } + } +} + +// Algorithm: for each column we haven't seen a value for yet, +// check each input row; +// +// Performance Optimizations: Don't continue scaning columns +// if we have already seen a non-null value, and stop early we +// have seen values for all columns. 
+async fn schema_pivot( + mut input_stream: SendableRecordBatchStream, + input_schema: SchemaRef, + output_schema: SchemaRef, + tx: mpsc::Sender>, + baseline_metrics: BaselineMetrics, +) -> Result<(), DataFusionError> { + let input_fields = input_schema.fields(); + let num_fields = input_fields.len(); + let mut field_indexes_with_seen_values = vec![false; num_fields]; + let mut num_fields_seen_with_values = 0; + + // use a loop so that we release the mutex once we have read each input_batch + let mut keep_searching = true; + while keep_searching { + let input_batch = input_stream.next().await.transpose()?; + let timer = baseline_metrics.elapsed_compute().timer(); + + keep_searching = match input_batch { + Some(input_batch) => { + let num_rows = input_batch.num_rows(); + + for (i, seen_value) in field_indexes_with_seen_values.iter_mut().enumerate() { + // only check fields we haven't seen values for + if !*seen_value { + let column = input_batch.column(i); + + let field_has_values = !column.is_empty() && column.null_count() < num_rows; + + if field_has_values { + *seen_value = true; + num_fields_seen_with_values += 1; + } + } + } + // need to keep searching if there are still some + // fields without values + num_fields_seen_with_values < num_fields + } + // no more input + None => false, + }; + timer.done(); + } + + // now, output a string for each column in the input schema + // that we saw values for + let column_names: StringArray = field_indexes_with_seen_values + .iter() + .enumerate() + .filter_map(|(field_index, has_values)| { + if *has_values { + Some(input_fields[field_index].name()) + } else { + None + } + }) + .map(Some) + .collect(); + + let batch = RecordBatch::try_new(output_schema, vec![Arc::new(column_names)])? 
+ .record_output(&baseline_metrics); + + // and send the result back + tx.send(Ok(batch)) + .await + .map_err(|e| ArrowError::from_external_error(Box::new(e)))?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::exec::stringset::{IntoStringSet, StringSetRef}; + + use super::*; + use arrow::{ + array::{Int64Array, StringArray}, + datatypes::{Field, Schema, SchemaRef}, + }; + use datafusion::physical_plan::memory::MemoryExec; + use datafusion_util::test_execute_partition; + + #[tokio::test] + async fn schema_pivot_exec_all_null() { + let case = SchemaTestCase { + input_batches: &[TestBatch { + a: &[None, None], + b: &[None, None], + }], + expected_output: &[], + }; + assert_eq!( + case.pivot().await, + case.expected_output(), + "TestCase: {case:?}" + ); + } + + #[tokio::test] + async fn schema_pivot_exec_both_non_null() { + let case = SchemaTestCase { + input_batches: &[TestBatch { + a: &[Some(1), None], + b: &[None, Some("foo")], + }], + expected_output: &["A", "B"], + }; + assert_eq!( + case.pivot().await, + case.expected_output(), + "TestCase: {case:?}" + ); + } + + #[tokio::test] + async fn schema_pivot_exec_one_non_null() { + let case = SchemaTestCase { + input_batches: &[TestBatch { + a: &[Some(1), None], + b: &[None, None], + }], + expected_output: &["A"], + }; + assert_eq!( + case.pivot().await, + case.expected_output(), + "TestCase: {case:?}" + ); + } + + #[tokio::test] + async fn schema_pivot_exec_both_non_null_two_record_batches() { + let case = SchemaTestCase { + input_batches: &[ + TestBatch { + a: &[Some(1), None], + b: &[None, None], + }, + TestBatch { + a: &[None, None], + b: &[None, Some("foo")], + }, + ], + expected_output: &["A", "B"], + }; + assert_eq!( + case.pivot().await, + case.expected_output(), + "TestCase: {case:?}" + ); + } + + #[tokio::test] + async fn schema_pivot_exec_one_non_null_in_second_record_batch() { + let case = SchemaTestCase { + input_batches: &[ + TestBatch { + a: &[None, None], + b: &[None, None], + }, + TestBatch { + a: 
&[None, Some(1), None], + b: &[None, Some("foo"), None], + }, + ], + expected_output: &["A", "B"], + }; + assert_eq!( + case.pivot().await, + case.expected_output(), + "TestCase: {case:?}" + ); + } + + #[tokio::test] + #[should_panic(expected = "SchemaPivotExec invalid partition 1")] + async fn schema_pivot_exec_bad_partition() { + // ensure passing in a bad partition generates a reasonable error + + let pivot = make_schema_pivot(SchemaTestCase::input_schema(), vec![]); + + test_execute_partition(pivot, 1).await; + } + + /// Return a StringSet extracted from the record batch + async fn reader_to_stringset(mut reader: SendableRecordBatchStream) -> StringSetRef { + let mut batches = Vec::new(); + // process the record batches one by one + while let Some(record_batch) = reader.next().await.transpose().expect("reading next batch") + { + batches.push(record_batch) + } + batches + .into_stringset() + .expect("Converted record batch reader into stringset") + } + + /// return a set for testing + fn to_stringset(strs: &[&str]) -> StringSetRef { + let stringset = strs.iter().map(|s| s.to_string()).collect(); + StringSetRef::new(stringset) + } + + /// Create a schema pivot node with a single input + fn make_schema_pivot( + input_schema: SchemaRef, + data: Vec, + ) -> Arc { + let input = make_memory_exec(input_schema, data); + let output_schema = Arc::new(make_schema_pivot_output_schema().as_ref().clone().into()); + Arc::new(SchemaPivotExec::new(input, output_schema)) + } + + /// Create an ExecutionPlan that produces `data` record batches. 
+ fn make_memory_exec(schema: SchemaRef, data: Vec) -> Arc { + let partitions = vec![data]; // single partition + let projection = None; + + let memory_exec = + MemoryExec::try_new(&partitions, schema, projection).expect("creating memory exec"); + + Arc::new(memory_exec) + } + + fn to_string_array(strs: &[Option<&str>]) -> Arc { + let arr: StringArray = strs.iter().collect(); + Arc::new(arr) + } + + // Input schema is (A INT, B STRING) + #[derive(Debug)] + struct TestBatch<'a> { + a: &'a [Option], + b: &'a [Option<&'a str>], + } + + // Input schema is (A INT, B STRING) + #[derive(Debug)] + struct SchemaTestCase<'a> { + // Input record batches, slices of slices (a,b) + input_batches: &'a [TestBatch<'a>], + expected_output: &'a [&'a str], + } + + impl SchemaTestCase<'_> { + fn input_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("A", DataType::Int64, true), + Field::new("B", DataType::Utf8, true), + ])) + } + + /// return expected output, as StringSet + fn expected_output(&self) -> StringSetRef { + to_stringset(self.expected_output) + } + + /// run the input batches through a schema pivot and return the results + /// as a StringSetRef + async fn pivot(&self) -> StringSetRef { + let schema = Self::input_schema(); + + // prepare input + let input_batches = self + .input_batches + .iter() + .map(|test_batch| { + let a_vec = test_batch.a.to_vec(); + RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int64Array::from(a_vec)), + to_string_array(test_batch.b), + ], + ) + .expect("Creating new record batch") + }) + .collect::>(); + + let pivot = make_schema_pivot(schema, input_batches); + + let results = test_execute_partition(pivot, 0).await; + + reader_to_stringset(results).await + } + } +} diff --git a/iox_query/src/exec/seriesset.rs b/iox_query/src/exec/seriesset.rs new file mode 100644 index 0000000..7ff393d --- /dev/null +++ b/iox_query/src/exec/seriesset.rs @@ -0,0 +1,89 @@ +//! 
This module contains the definition of a "SeriesSet" a plan that when run +//! produces rows that can be logically divided into "Series" +//! +//! Specifically, a SeriesSet wraps a "table", and each table is +//! sorted on a set of "tag" columns, meaning the data the series +//! series will be contiguous. +//! +//! For example, the output columns of such a plan would be: +//! (tag col0) (tag col1) ... (tag colN) (field val1) (field val2) ... (field +//! valN) .. (timestamps) +//! +//! Note that the data will come out ordered by the tag keys (ORDER BY +//! (tag col0) (tag col1) ... (tag colN)) +//! +//! NOTE: The InfluxDB classic storage engine not only returns +//! series sorted by the tag values, but the order of the tag columns +//! (and thus the actual sort order) is also lexographically +//! sorted. So for example, if you have `region`, `host`, and +//! `service` as tags, the columns would be ordered `host`, `region`, +//! and `service` as well. + +pub mod converter; +pub mod series; + +use arrow::{self, record_batch::RecordBatch}; + +use std::sync::Arc; + +use super::field::FieldIndexes; + +#[derive(Debug)] +/// Information to map a slice of rows in a [`RecordBatch`] sorted by +/// tags and timestamps to several timeseries that share the same +/// tag keys and timestamps. 
+/// +/// The information in a [`SeriesSet`] can be used to "unpivot" a +/// [`RecordBatch`] into one or more Time Series as [`series::Series`] +/// +/// For example, given the following set of rows from a [`RecordBatch`] +/// which must be sorted by `(TagA, TagB, time)`: +// +/// TagA | TagB | Field1 | Field2 | time +/// -----+------+--------+--------+------- +/// a | b | 1 | 10 | 100 +/// a | b | 2 | 20 | 200 +/// a | b | 3 | 30 | 300 +/// a | x | 11 | | 100 +/// a | x | 12 | | 200 +/// +/// Would be represented as +/// * `SeriesSet` 1: For {TagA='a', TagB='b'} +/// * `SeriesSet` 2: For {TagA='a', TagB='x'} +/// +/// `SeriesSet` 1 would produce 2 series (one for each field): +/// +/// {_field=Field1, TagA=a, TagB=b} timestamps = {100, 200, 300} values = {1, 2, 3} +/// {_field=Field2, TagA=a, TagB=b} timestamps = {100, 200, 300} values = {100, 200, 300} +/// +/// `SeriesSet` 2 would produce a single series for `Field1` (no +/// series is created for `Field2` because there are no values for +/// `Field2` where TagA=a, and TagB=x) +/// +/// {_field=Field1, TagA=a, TagB=x} timestamps = {100, 200} values = {11, 12} +/// +/// NB: The heavy use of `Arc` is to avoid many duplicated Strings given +/// the the fact that many SeriesSets share the same tag keys and +/// table name. +pub struct SeriesSet { + /// The table name this series came from + pub table_name: Arc, + + /// key = value pairs that define this series + pub tags: Vec<(Arc, Arc)>, + + /// the column index of each "field" of the time series. For + /// example, if there are two field indexes then this series set + /// would result in two distinct series being sent back, one for + /// each field. 
+ pub field_indexes: FieldIndexes, + + // The row in the record batch where the data starts (inclusive) + pub start_row: usize, + + // The number of rows in the record batch that the data goes to + pub num_rows: usize, + + // The underlying record batch data + pub batch: RecordBatch, +} diff --git a/iox_query/src/exec/seriesset/converter.rs b/iox_query/src/exec/seriesset/converter.rs new file mode 100644 index 0000000..81e8384 --- /dev/null +++ b/iox_query/src/exec/seriesset/converter.rs @@ -0,0 +1,1746 @@ +//! This module contains code that "unpivots" annotated +//! [`RecordBatch`]es to [`Series`] and [`Group`]s for output by the +//! storage gRPC interface + +use arrow::{ + self, + array::{downcast_array, Array, BooleanArray, DictionaryArray, StringArray}, + compute, + datatypes::{DataType, Int32Type, SchemaRef}, + record_batch::RecordBatch, +}; +use datafusion::{ + error::DataFusionError, + execution::memory_pool::{proxy::VecAllocExt, MemoryConsumer, MemoryPool, MemoryReservation}, + physical_plan::SendableRecordBatchStream, +}; + +use futures::{ready, Stream, StreamExt}; +use predicate::rpc_predicate::{GROUP_KEY_SPECIAL_START, GROUP_KEY_SPECIAL_STOP}; +use snafu::{OptionExt, Snafu}; +use std::{ + collections::VecDeque, + future::Future, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use crate::exec::{ + field::{self, FieldColumns, FieldIndexes}, + seriesset::series::Group, +}; + +use super::{ + series::{Either, Series}, + SeriesSet, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Internal field error while converting series set: {}", source))] + InternalField { source: field::Error }, + + #[snafu(display("Internal error finding grouping colum: {}", column_name))] + FindingGroupColumn { column_name: String }, +} + +pub type Result = std::result::Result; + +// Handles converting record batches into SeriesSets +#[derive(Debug, Default, Copy, Clone)] +pub struct SeriesSetConverter {} + +impl SeriesSetConverter { + /// Convert the 
results from running a DataFusion plan into the + /// appropriate SeriesSetItems. + /// + /// The results must be in the logical format described in this + /// module's documentation (i.e. ordered by tag keys) + /// + /// table_name: The name of the table + /// + /// tag_columns: The names of the columns that define tags + /// + /// field_columns: The names of the columns which are "fields" + /// + /// it: record batch iterator that produces data in the desired order + pub async fn convert( + &mut self, + table_name: Arc, + tag_columns: Arc>>, + field_columns: FieldColumns, + it: SendableRecordBatchStream, + ) -> Result>, DataFusionError> { + assert_eq!( + tag_columns.as_ref(), + &{ + let mut tmp = tag_columns.as_ref().clone(); + tmp.sort(); + tmp + }, + "Tag column sorted", + ); + + let schema = it.schema(); + + let tag_indexes = FieldIndexes::names_to_indexes(&schema, &tag_columns).map_err(|e| { + DataFusionError::Context( + "Internal field error while converting series set".to_string(), + Box::new(DataFusionError::External(Box::new(e))), + ) + })?; + let field_indexes = + FieldIndexes::from_field_columns(&schema, &field_columns).map_err(|e| { + DataFusionError::Context( + "Internal field error while converting series set".to_string(), + Box::new(DataFusionError::External(Box::new(e))), + ) + })?; + + Ok(SeriesSetConverterStream { + result_buffer: VecDeque::default(), + open_batches: Vec::default(), + need_new_batch: true, + we_finished: false, + schema, + it: Some(it), + tag_indexes, + field_indexes, + table_name, + tag_columns, + }) + } + + /// Returns the row indexes in `batch` where all of the values in the `tag_indexes` columns + /// take on a new value. 
+ /// + /// For example: + /// + /// ```text + /// tags A, B + /// ``` + /// + /// If the input is: + /// + /// A | B | C + /// - | - | - + /// 1 | 2 | x + /// 1 | 2 | y + /// 2 | 2 | z + /// 3 | 3 | q + /// 3 | 3 | r + /// + /// Then this function will return `[3, 4]`: + /// + /// - The row at index 3 has values for A and B (2,2) different than the previous row (1,2). + /// - Similarly the row at index 4 has values (3,3) which are different than (2,2). + /// - However, the row at index 5 has the same values (3,3) so is NOT a transition point + fn compute_changepoints(batch: &RecordBatch, tag_indexes: &[usize]) -> Vec { + let tag_transitions = tag_indexes + .iter() + .map(|&col| Self::compute_transitions(batch, col)) + .collect::>(); + + // no tag columns, emit a single tagset + if tag_transitions.is_empty() { + vec![] + } else { + // OR bitsets together to to find all rows where the + // keyset (values of the tag keys) changes + let mut tag_transitions_it = tag_transitions.into_iter(); + let init = tag_transitions_it.next().expect("not empty"); + let intersections = + tag_transitions_it.fold(init, |a, b| compute::or(&a, &b).expect("or operation")); + + intersections + .iter() + .enumerate() + .filter(|(_idx, mask)| mask.unwrap_or(true)) + .map(|(idx, _mask)| idx) + .collect() + } + } + + /// returns a bitset with all row indexes where the value of the + /// batch `col_idx` changes. 
Does not include row 0, always includes
    /// the last row, `batch.num_rows() - 1`
    ///
    /// Note: This may return false positives in the presence of dictionaries
    /// containing duplicates
    fn compute_transitions(batch: &RecordBatch, col_idx: usize) -> BooleanArray {
        let num_rows = batch.num_rows();

        if num_rows == 0 {
            return BooleanArray::builder(0).finish();
        }

        let col = batch.column(col_idx);

        // Row 0 is never a transition; row i (i > 0) is a transition when
        // col[i] != col[i - 1], i.e. compare the column against itself
        // shifted by one row.
        let head = {
            let mut b = BooleanArray::builder(1);
            b.append_value(false);
            b.finish()
        };
        let shifted_neq = arrow::compute::kernels::cmp::neq(
            &col.slice(0, col.len() - 1),
            &col.slice(1, col.len() - 1),
        )
        .expect("cmp");

        let arr = compute::concat(&[&head, &shifted_neq]).expect("concat");
        downcast_array(&arr)
    }

    /// Creates (column_name, column_value) pairs for each column
    /// named in `tag_column_names` at the corresponding index in
    /// `tag_indexes`
    fn get_tag_keys(
        batch: &RecordBatch,
        row: usize,
        tag_column_names: &[Arc<str>],
        tag_indexes: &[usize],
    ) -> Vec<(Arc<str>, Arc<str>)> {
        assert_eq!(tag_column_names.len(), tag_indexes.len());

        let mut out = tag_column_names
            .iter()
            .zip(tag_indexes)
            .filter_map(|(column_name, &column_index)| {
                let col = batch.column(column_index);
                // NULL tag values are skipped entirely (no pair emitted)
                let tag_value = match col.data_type() {
                    DataType::Utf8 => {
                        let col = col.as_any().downcast_ref::<StringArray>().unwrap();
                        col.is_valid(row).then(|| col.value(row).to_string())
                    }
                    DataType::Dictionary(key, value)
                        if key.as_ref() == &DataType::Int32
                            && value.as_ref() == &DataType::Utf8 =>
                    {
                        let col = col
                            .as_any()
                            .downcast_ref::<DictionaryArray<Int32Type>>()
                            .expect("Casting column");

                        if col.is_valid(row) {
                            let key = col.keys().value(row);
                            let value = col
                                .values()
                                .as_any()
                                .downcast_ref::<StringArray>()
                                .unwrap()
                                .value(key as _)
                                .to_string();
                            Some(value)
                        } else {
                            None
                        }
                    }
                    _ => unimplemented!(
                        "Series get_tag_keys not supported for type {:?} in column {:?}",
                        col.data_type(),
                        batch.schema().fields()[column_index]
                    ),
                };

                tag_value.map(|tag_value| (Arc::clone(column_name), Arc::from(tag_value.as_str())))
            })
            .collect::<Vec<_>>();

        out.shrink_to_fit();
        out
    }
}

struct SeriesSetConverterStream {
    /// [`SeriesSet`]s that are ready to be emitted by this stream.
    ///
    /// These results must always be emitted before doing any additional work.
    result_buffer: VecDeque<SeriesSet>,

    /// Batches of data that have NO change point, i.e. they all belong to the same output set. However we have not yet
    /// found the next change point (or the end of the stream) so we need to keep them.
    ///
    /// We keep a list of batches instead of a giant concatenated batch to avoid `O(n^2)` complexity due to repeated mem-copies.
    open_batches: Vec<RecordBatch>,

    /// If `true`, we need to pull a new batch from `it`.
    need_new_batch: bool,

    /// We (i.e. [`SeriesSetConverterStream`]) completed our work. However there might be data available in
    /// [`result_buffer`](Self::result_buffer) which must be drained before returning `Ready(None)`.
    we_finished: bool,

    /// The schema of the input data.
    schema: SchemaRef,

    /// Indexes (within [`schema`](Self::schema)) of the tag columns.
    tag_indexes: Vec<usize>,

    /// Indexes (within [`schema`](Self::schema)) of the field columns.
    field_indexes: FieldIndexes,

    /// Name of the table we're operating on.
    ///
    /// This is required because this is part of the output [`SeriesSet`]s.
    table_name: Arc<str>,

    /// Name of the tag columns.
    ///
    /// This is kept in addition to [`tag_indexes`](Self::tag_indexes) because it is part of the output [`SeriesSet`]s.
    tag_columns: Arc<Vec<Arc<str>>>,

    /// Input data stream.
    ///
    /// This may be `None` when the stream was fully drained. We need to remember that fact so we don't poll a
    /// finished stream (which may panic).
+ it: Option, +} + +impl Stream for SeriesSetConverterStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = &mut *self; + + loop { + // drain results + if let Some(sset) = this.result_buffer.pop_front() { + return Poll::Ready(Some(Ok(sset))); + } + + // early exit + if this.we_finished { + return Poll::Ready(None); + } + + // do we need more input data? + if this.need_new_batch { + loop { + match ready!(this + .it + .as_mut() + .expect("need new input but input stream is already drained") + .poll_next_unpin(cx)) + { + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); + } + Some(Ok(batch)) => { + // skip empty batches (simplifies our code further down below because we can always assume that + // there's at least one row in the batch) + if batch.num_rows() == 0 { + continue; + } + + this.open_batches.push(batch) + } + None => { + this.it = None; + } + } + break; + } + + this.need_new_batch = false; + } + + // do we only have a single batch or do we "overflow" from the last batch? + let (batch_for_changepoints, extra_first_row) = match this.open_batches.len() { + 0 => { + assert!( + this.it.is_none(), + "We have no open batches left, so the input stream should be finished", + ); + this.we_finished = true; + return Poll::Ready(None); + } + 1 => ( + this.open_batches.last().expect("checked length").clone(), + false, + ), + _ => { + // `open_batches` contains at least two batches. The last one was added just from the input stream. + // The prev. one was the end of the "open" interval and all before that belong to the same output + // set (because otherwise we would have flushed them earlier). 
+ let batch_last = &this.open_batches[this.open_batches.len() - 2]; + let batch_current = &this.open_batches[this.open_batches.len() - 1]; + assert!(batch_last.num_rows() > 0); + + let batch = match compute::concat_batches( + &this.schema, + &[ + batch_last.slice(batch_last.num_rows() - 1, 1), + batch_current.clone(), + ], + ) { + Ok(batch) => batch, + Err(e) => { + // internal state is broken, end this stream + this.we_finished = true; + return Poll::Ready(Some(Err(DataFusionError::ArrowError(e, None)))); + } + }; + + (batch, true) + } + }; + + // compute changepoints + let mut changepoints = SeriesSetConverter::compute_changepoints( + &batch_for_changepoints, + &this.tag_indexes, + ); + if this.it.is_none() { + // need to finish last SeriesSet + changepoints.push(batch_for_changepoints.num_rows()); + } + let prev_sizes = this.open_batches[..(this.open_batches.len() - 1)] + .iter() + .map(|b| b.num_rows()) + .sum::(); + let cp_delta = if extra_first_row { + prev_sizes + .checked_sub(1) + .expect("at least one non-empty prev. batch") + } else { + prev_sizes + }; + let changepoints = changepoints + .into_iter() + .map(|x| x + cp_delta) + .collect::>(); + + // already change to "needs data" before we start emission + this.need_new_batch = true; + if this.it.is_none() { + this.we_finished = true; + } + + if !changepoints.is_empty() { + // `batch_for_changepoints` only contains the last batch and the last row of the prev. one. However we + // need to flush ALL rows in `open_batches` (and keep the ones after the last changepoint as a new open + // batch). So concat again. 
+ let batch_for_flush = + match compute::concat_batches(&this.schema, &this.open_batches) { + Ok(batch) => batch, + Err(e) => { + // internal state is broken, end this stream + this.we_finished = true; + return Poll::Ready(Some(Err(DataFusionError::ArrowError(e, None)))); + } + }; + + let last_cp = *changepoints.last().expect("checked length"); + if last_cp == batch_for_flush.num_rows() { + // fully drained open batches + // This can ONLY happen when the input stream finished because `comput_changepoint` never returns + // the last row as changepoint (so we must have manually added that above). + assert!( + this.it.is_none(), + "Fully flushed all open batches but the input stream still has data?!" + ); + this.open_batches.drain(..); + } else { + // need to keep the open bit + // do NOT use `batch` here because it contains data for all open batches, we just need the last one + // (`slice` is zero-copy) + let offset = last_cp.checked_sub(prev_sizes).expect("underflow"); + let last_batch = this.open_batches.last().expect("at least one batch"); + let last_batch = last_batch.slice( + offset, + last_batch + .num_rows() + .checked_sub(offset) + .expect("underflow"), + ); + this.open_batches.drain(..); + this.open_batches.push(last_batch); + } + + // emit each series + let mut start_row: usize = 0; + assert!(this.result_buffer.is_empty()); + this.result_buffer = changepoints + .into_iter() + .map(|end_row| { + let series_set = SeriesSet { + table_name: Arc::clone(&this.table_name), + tags: SeriesSetConverter::get_tag_keys( + &batch_for_flush, + start_row, + &this.tag_columns, + &this.tag_indexes, + ), + field_indexes: this.field_indexes.clone(), + start_row, + num_rows: (end_row - start_row), + // batch clones are super cheap (in contrast to `slice` which has a way higher overhead!) 
+ batch: batch_for_flush.clone(), + }; + + start_row = end_row; + series_set + }) + .collect(); + } + } + } +} + +/// Reorders and groups a sequence of Series is grouped correctly +#[derive(Debug)] +pub struct GroupGenerator { + group_columns: Vec>, + memory_pool: Arc, + collector_buffered_size_max: usize, +} + +impl GroupGenerator { + pub fn new(group_columns: Vec>, memory_pool: Arc) -> Self { + Self::new_with_buffered_size_max( + group_columns, + memory_pool, + Collector::<()>::DEFAULT_ALLOCATION_BUFFER_SIZE, + ) + } + + fn new_with_buffered_size_max( + group_columns: Vec>, + memory_pool: Arc, + collector_buffered_size_max: usize, + ) -> Self { + Self { + group_columns, + memory_pool, + collector_buffered_size_max, + } + } + + /// groups the set of `series` into SeriesOrGroups + /// + /// TODO: make this truly stream-based, see . + pub async fn group( + self, + series: S, + ) -> Result>, DataFusionError> + where + S: Stream> + Send, + { + let series = Box::pin(series); + let mut series = Collector::new( + series, + self.group_columns, + self.memory_pool, + self.collector_buffered_size_max, + ) + .await?; + + // Potential optimization is to skip this sort if we are + // grouping by a prefix of the tags for a single measurement + // + // Another potential optimization is if we are only grouping on + // tag columns is to change the the actual plan output using + // DataFusion to sort the data in the required group (likely + // only possible with a single table) + + // Resort the data according to group key values + series.sort(); + + // now find the groups boundaries and emit the output + let mut last_partition_key_vals: Option>> = None; + + // Note that if there are no group columns, we still need to + // sort by the tag keys, so that the output is sorted by tag + // keys, and thus we can't bail out early here + // + // Interesting, it isn't clear flux requires this ordering, but + // it is what TSM does so we preserve the behavior + let mut output = vec![]; + + // 
TODO make this more functional (issue is that sometimes the + // loop inserts one item into `output` and sometimes it inserts 2) + for SortableSeries { + series, + tag_vals, + num_partition_keys, + } in series.into_iter() + { + // keep only the values that form the group + let mut partition_key_vals = tag_vals; + partition_key_vals.truncate(num_partition_keys); + + // figure out if we are in a new group (partition key values have changed) + let need_group_start = match &last_partition_key_vals { + None => true, + Some(last_partition_key_vals) => &partition_key_vals != last_partition_key_vals, + }; + + if need_group_start { + last_partition_key_vals = Some(partition_key_vals.clone()); + + let tag_keys = series.tags.iter().map(|tag| Arc::clone(&tag.key)).collect(); + + let group = Group { + tag_keys, + partition_key_vals, + }; + + output.push(group.into()); + } + + output.push(series.into()) + } + + Ok(futures::stream::iter(output).map(Ok)) + } +} + +#[derive(Debug)] +/// Wrapper around a Series that has the values of the group_by columns extracted +struct SortableSeries { + series: Series, + + /// All the tag values, reordered so that the group_columns are first + tag_vals: Vec>, + + /// How many of the first N tag_values are used for the partition key + num_partition_keys: usize, +} + +impl PartialEq for SortableSeries { + fn eq(&self, other: &Self) -> bool { + self.tag_vals.eq(&other.tag_vals) + } +} + +impl Eq for SortableSeries {} + +impl PartialOrd for SortableSeries { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for SortableSeries { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.tag_vals.cmp(&other.tag_vals) + } +} + +impl SortableSeries { + fn try_new(series: Series, group_columns: &[Arc]) -> Result { + // Compute the order of new tag values + let tags = &series.tags; + + // tag_used_set[i] is true if we have used the value in tag_columns[i] + let mut tag_used_set = vec![false; tags.len()]; + + // 
put the group columns first + // + // Note that this is an O(N^2) algorithm. We are assuming the + // number of tag columns is reasonably small + let mut tag_vals: Vec<_> = group_columns + .iter() + .map(|col| { + tags.iter() + .enumerate() + // Searching for columns linearly is likely to be pretty slow.... + .find(|(_i, tag)| tag.key == *col) + .map(|(i, tag)| { + assert!(!tag_used_set[i], "repeated group column"); + tag_used_set[i] = true; + Arc::clone(&tag.value) + }) + .or_else(|| { + // treat these specially and use value "" to mirror what TSM does + // see https://github.com/influxdata/influxdb_iox/issues/2693#issuecomment-947695442 + // for more details + if col.as_ref() == GROUP_KEY_SPECIAL_START + || col.as_ref() == GROUP_KEY_SPECIAL_STOP + { + Some(Arc::from("")) + } else { + None + } + }) + .context(FindingGroupColumnSnafu { + column_name: col.as_ref(), + }) + }) + .collect::>>()?; + + // Fill in all remaining tags + tag_vals.extend(tags.iter().enumerate().filter_map(|(i, tag)| { + let use_tag = !tag_used_set[i]; + use_tag.then(|| Arc::clone(&tag.value)) + })); + + // safe memory + tag_vals.shrink_to_fit(); + + Ok(Self { + series, + tag_vals, + num_partition_keys: group_columns.len(), + }) + } + + /// Memory usage in bytes, including `self`. + fn size(&self) -> usize { + std::mem::size_of_val(self) + self.series.size() - std::mem::size_of_val(&self.series) + + (std::mem::size_of::>() * self.tag_vals.capacity()) + + self.tag_vals.iter().map(|s| s.len()).sum::() + } +} + +/// [`Future`] that collects [`Series`] objects into a [`SortableSeries`] vector while registering/checking memory +/// allocations with a [`MemoryPool`]. +/// +/// This avoids unbounded memory growth when merging multiple `Series` in memory +struct Collector { + /// The inner stream was fully drained. + inner_done: bool, + + /// This very future finished. + outer_done: bool, + + /// Inner stream. + inner: S, + + /// Group columns. 
+ /// + /// These are required for [`SortableSeries::try_new`]. + group_columns: Vec>, + + /// Already collected objects. + collected: Vec, + + /// Buffered but not-yet-registered allocated size. + /// + /// We use an additional buffer here because in contrast to the normal DataFusion processing, the input stream is + /// NOT batched and we want to avoid costly memory allocations checks with the [`MemoryPool`] for every single element. + buffered_size: usize, + + /// Maximum [buffered size](Self::buffered_size). Decreasing this + /// value causes allocations to be reported to the [`MemoryPool`] + /// more frequently. + buffered_size_max: usize, + + /// Our memory reservation. + mem_reservation: MemoryReservation, +} + +impl Collector { + /// Default maximum [buffered size](Self::buffered_size) before updating [`MemoryPool`] reservation + const DEFAULT_ALLOCATION_BUFFER_SIZE: usize = 1024 * 1024; +} + +impl Collector +where + S: Stream> + Send + Unpin, +{ + fn new( + inner: S, + group_columns: Vec>, + memory_pool: Arc, + buffered_size_max: usize, + ) -> Self { + let mem_reservation = MemoryConsumer::new("SeriesSet Collector").register(&memory_pool); + + Self { + inner_done: false, + outer_done: false, + inner, + group_columns, + collected: Vec::with_capacity(0), + buffered_size: 0, + buffered_size_max, + mem_reservation, + } + } + + /// Registers all `self.buffered_size` with the MemoryPool, + /// resetting self.buffered_size to zero. Returns an error if new + /// memory can not be allocated from the pool. 
+ fn alloc(&mut self) -> Result<(), DataFusionError> { + let bytes = std::mem::take(&mut self.buffered_size); + self.mem_reservation.try_grow(bytes) + } +} + +impl Future for Collector +where + S: Stream> + Send + Unpin, +{ + type Output = Result, DataFusionError>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = &mut *self; + + loop { + assert!(!this.outer_done); + // if the underlying stream is drained and the allocation future is ready (see above), we can finalize this future + if this.inner_done { + this.outer_done = true; + return Poll::Ready(Ok(std::mem::take(&mut this.collected))); + } + + match ready!(this.inner.poll_next_unpin(cx)) { + Some(Ok(series)) => match SortableSeries::try_new(series, &this.group_columns) { + Ok(series) => { + // Note: the size of `SortableSeries` itself is already included in the vector allocation + this.buffered_size += series.size() - std::mem::size_of_val(&series); + this.collected + .push_accounted(series, &mut this.buffered_size); + + // should we clear our allocation buffer? + if this.buffered_size > this.buffered_size_max { + if let Err(e) = this.alloc() { + return Poll::Ready(Err(e)); + } + continue; + } + } + Err(e) => { + // poison this future + this.outer_done = true; + return Poll::Ready(Err(DataFusionError::External(Box::new(e)))); + } + }, + Some(Err(e)) => { + // poison this future + this.outer_done = true; + return Poll::Ready(Err(e)); + } + None => { + // underlying stream drained. 
now register the final allocation and then we're done + this.inner_done = true; + if this.buffered_size > 0 { + if let Err(e) = this.alloc() { + return Poll::Ready(Err(e)); + } + } + continue; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use arrow::{ + array::{ArrayRef, Float64Array, Int64Array, TimestampNanosecondArray}, + csv, + datatypes::DataType, + datatypes::Field, + datatypes::{Schema, SchemaRef}, + record_batch::RecordBatch, + }; + use arrow_util::assert_batches_eq; + use assert_matches::assert_matches; + use datafusion::execution::memory_pool::GreedyMemoryPool; + use datafusion_util::{stream_from_batch, stream_from_batches, stream_from_schema}; + use futures::TryStreamExt; + use itertools::Itertools; + use test_helpers::str_vec_to_arc_vec; + + use crate::exec::seriesset::series::{Batch, Data, Tag}; + + use super::*; + + #[tokio::test] + async fn test_convert_empty() { + let schema = test_schema(); + let empty_iterator = stream_from_schema(schema); + + let table_name = "foo"; + let tag_columns = []; + let field_columns = []; + + let results = convert(table_name, &tag_columns, &field_columns, empty_iterator).await; + assert_eq!(results.len(), 0); + } + + fn test_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("tag_a", DataType::Utf8, true), + Field::new("tag_b", DataType::Utf8, true), + Field::new("float_field", DataType::Float64, true), + Field::new("int_field", DataType::Int64, true), + Field::new("time", DataType::Int64, false), + ])) + } + + #[tokio::test] + async fn test_convert_single_series_no_tags() { + // single series + let schema = test_schema(); + let inputs = parse_to_iterators(schema, &["one,ten,10.0,1,1000", "one,ten,10.1,2,2000"]); + for (i, input) in inputs.into_iter().enumerate() { + println!("Stream {i}"); + + let table_name = "foo"; + let tag_columns = []; + let field_columns = ["float_field"]; + let results = convert(table_name, &tag_columns, &field_columns, input).await; + + assert_eq!(results.len(), 1); + + 
assert_series_set( + &results[0], + "foo", + [], + FieldIndexes::from_timestamp_and_value_indexes(4, &[2]), + [ + "+-------+-------+-------------+-----------+------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+-------+-------------+-----------+------+", + "| one | ten | 10.0 | 1 | 1000 |", + "| one | ten | 10.1 | 2 | 2000 |", + "+-------+-------+-------------+-----------+------+", + ], + ); + } + } + + #[tokio::test] + async fn test_convert_single_series_no_tags_nulls() { + // single series + let schema = test_schema(); + + let inputs = parse_to_iterators(schema, &["one,ten,10.0,,1000", "one,ten,10.1,,2000"]); + + // send no values in the int_field colum + for (i, input) in inputs.into_iter().enumerate() { + println!("Stream {i}"); + + let table_name = "foo"; + let tag_columns = []; + let field_columns = ["float_field"]; + let results = convert(table_name, &tag_columns, &field_columns, input).await; + + assert_eq!(results.len(), 1); + + assert_series_set( + &results[0], + "foo", + [], + FieldIndexes::from_timestamp_and_value_indexes(4, &[2]), + [ + "+-------+-------+-------------+-----------+------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+-------+-------------+-----------+------+", + "| one | ten | 10.0 | | 1000 |", + "| one | ten | 10.1 | | 2000 |", + "+-------+-------+-------------+-----------+------+", + ], + ); + } + } + + #[tokio::test] + async fn test_convert_single_series_one_tag() { + // single series + let schema = test_schema(); + let inputs = parse_to_iterators(schema, &["one,ten,10.0,1,1000", "one,ten,10.1,2,2000"]); + + for (i, input) in inputs.into_iter().enumerate() { + println!("Stream {i}"); + + // test with one tag column, one series + let table_name = "bar"; + let tag_columns = ["tag_a"]; + let field_columns = ["float_field"]; + let results = convert(table_name, &tag_columns, &field_columns, input).await; + + assert_eq!(results.len(), 1); + + assert_series_set( + &results[0], + "bar", + 
[("tag_a", "one")], + FieldIndexes::from_timestamp_and_value_indexes(4, &[2]), + [ + "+-------+-------+-------------+-----------+------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+-------+-------------+-----------+------+", + "| one | ten | 10.0 | 1 | 1000 |", + "| one | ten | 10.1 | 2 | 2000 |", + "+-------+-------+-------------+-----------+------+", + ], + ); + } + } + + #[tokio::test] + async fn test_convert_single_series_one_tag_more_rows() { + // single series + let schema = test_schema(); + let inputs = parse_to_iterators( + schema, + &[ + "one,ten,10.0,1,1000", + "one,ten,10.1,2,2000", + "one,ten,10.2,3,3000", + ], + ); + + for (i, input) in inputs.into_iter().enumerate() { + println!("Stream {i}"); + + // test with one tag column, one series + let table_name = "bar"; + let tag_columns = ["tag_a"]; + let field_columns = ["float_field"]; + let results = convert(table_name, &tag_columns, &field_columns, input).await; + + assert_eq!(results.len(), 1); + + assert_series_set( + &results[0], + "bar", + [("tag_a", "one")], + FieldIndexes::from_timestamp_and_value_indexes(4, &[2]), + [ + "+-------+-------+-------------+-----------+------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+-------+-------------+-----------+------+", + "| one | ten | 10.0 | 1 | 1000 |", + "| one | ten | 10.1 | 2 | 2000 |", + "| one | ten | 10.2 | 3 | 3000 |", + "+-------+-------+-------------+-----------+------+", + ], + ); + } + } + + #[tokio::test] + async fn test_convert_one_tag_multi_series() { + let schema = test_schema(); + + let inputs = parse_to_iterators( + schema, + &[ + "one,ten,10.0,1,1000", + "one,ten,10.1,2,2000", + "one,eleven,10.1,3,3000", + "two,eleven,10.2,4,4000", + "two,eleven,10.3,5,5000", + ], + ); + + for (i, input) in inputs.into_iter().enumerate() { + println!("Stream {i}"); + + let table_name = "foo"; + let tag_columns = ["tag_a"]; + let field_columns = ["int_field"]; + let results = convert(table_name, 
&tag_columns, &field_columns, input).await; + + assert_eq!(results.len(), 2); + + assert_series_set( + &results[0], + "foo", + [("tag_a", "one")], + FieldIndexes::from_timestamp_and_value_indexes(4, &[3]), + [ + "+-------+--------+-------------+-----------+------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+--------+-------------+-----------+------+", + "| one | ten | 10.0 | 1 | 1000 |", + "| one | ten | 10.1 | 2 | 2000 |", + "| one | eleven | 10.1 | 3 | 3000 |", + "+-------+--------+-------------+-----------+------+", + ], + ); + assert_series_set( + &results[1], + "foo", + [("tag_a", "two")], + FieldIndexes::from_timestamp_and_value_indexes(4, &[3]), + [ + "+-------+--------+-------------+-----------+------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+--------+-------------+-----------+------+", + "| two | eleven | 10.2 | 4 | 4000 |", + "| two | eleven | 10.3 | 5 | 5000 |", + "+-------+--------+-------------+-----------+------+", + ], + ); + } + } + + // two tag columns, three series + #[tokio::test] + async fn test_convert_two_tag_multi_series() { + let schema = test_schema(); + + let inputs = parse_to_iterators( + schema, + &[ + "one,ten,10.0,1,1000", + "one,ten,10.1,2,2000", + "one,eleven,10.1,3,3000", + "two,eleven,10.2,4,4000", + "two,eleven,10.3,5,5000", + ], + ); + + for (i, input) in inputs.into_iter().enumerate() { + println!("Stream {i}"); + + let table_name = "foo"; + let tag_columns = ["tag_a", "tag_b"]; + let field_columns = ["int_field"]; + let results = convert(table_name, &tag_columns, &field_columns, input).await; + + assert_eq!(results.len(), 3); + + assert_series_set( + &results[0], + "foo", + [("tag_a", "one"), ("tag_b", "ten")], + FieldIndexes::from_timestamp_and_value_indexes(4, &[3]), + [ + "+-------+-------+-------------+-----------+------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+-------+-------------+-----------+------+", + "| one | ten | 10.0 | 1 | 1000 |", 
+ "| one | ten | 10.1 | 2 | 2000 |", + "+-------+-------+-------------+-----------+------+", + ], + ); + assert_series_set( + &results[1], + "foo", + [("tag_a", "one"), ("tag_b", "eleven")], + FieldIndexes::from_timestamp_and_value_indexes(4, &[3]), + [ + "+-------+--------+-------------+-----------+------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+--------+-------------+-----------+------+", + "| one | eleven | 10.1 | 3 | 3000 |", + "+-------+--------+-------------+-----------+------+", + ], + ); + assert_series_set( + &results[2], + "foo", + [("tag_a", "two"), ("tag_b", "eleven")], + FieldIndexes::from_timestamp_and_value_indexes(4, &[3]), + [ + "+-------+--------+-------------+-----------+------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+--------+-------------+-----------+------+", + "| two | eleven | 10.2 | 4 | 4000 |", + "| two | eleven | 10.3 | 5 | 5000 |", + "+-------+--------+-------------+-----------+------+", + ], + ); + } + } + + #[tokio::test] + async fn test_convert_two_tag_with_null_multi_series() { + let tag_a = StringArray::from(vec!["one", "one", "one"]); + let tag_b = StringArray::from(vec![Some("ten"), Some("ten"), None]); + let float_field = Float64Array::from(vec![10.0, 10.1, 10.1]); + let int_field = Int64Array::from(vec![1, 2, 3]); + let time = TimestampNanosecondArray::from(vec![1000, 2000, 3000]); + + let batch = RecordBatch::try_from_iter_with_nullable(vec![ + ("tag_a", Arc::new(tag_a) as ArrayRef, true), + ("tag_b", Arc::new(tag_b), true), + ("float_field", Arc::new(float_field), true), + ("int_field", Arc::new(int_field), true), + ("time", Arc::new(time), false), + ]) + .unwrap(); + + // Input has one row that has no value (NULL value) for tag_b, which is its own series + let input = stream_from_batch(batch.schema(), batch); + + let table_name = "foo"; + let tag_columns = ["tag_a", "tag_b"]; + let field_columns = ["int_field"]; + let results = convert(table_name, &tag_columns, 
&field_columns, input).await; + + assert_eq!(results.len(), 2); + + assert_series_set( + &results[0], + "foo", + [("tag_a", "one"), ("tag_b", "ten")], + FieldIndexes::from_timestamp_and_value_indexes(4, &[3]), + [ + "+-------+-------+-------------+-----------+-----------------------------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+-------+-------------+-----------+-----------------------------+", + "| one | ten | 10.0 | 1 | 1970-01-01T00:00:00.000001Z |", + "| one | ten | 10.1 | 2 | 1970-01-01T00:00:00.000002Z |", + "+-------+-------+-------------+-----------+-----------------------------+", + ], + ); + assert_series_set( + &results[1], + "foo", + [("tag_a", "one")], // note no value for tag_b, only one tag + FieldIndexes::from_timestamp_and_value_indexes(4, &[3]), + [ + "+-------+-------+-------------+-----------+-----------------------------+", + "| tag_a | tag_b | float_field | int_field | time |", + "+-------+-------+-------------+-----------+-----------------------------+", + "| one | | 10.1 | 3 | 1970-01-01T00:00:00.000003Z |", + "+-------+-------+-------------+-----------+-----------------------------+", + ], + ); + } + + /// Test helper: run conversion and return a Vec + pub async fn convert<'a>( + table_name: &'a str, + tag_columns: &'a [&'a str], + field_columns: &'a [&'a str], + it: SendableRecordBatchStream, + ) -> Vec { + let mut converter = SeriesSetConverter::default(); + + let table_name = Arc::from(table_name); + let tag_columns = Arc::new(str_vec_to_arc_vec(tag_columns)); + let field_columns = FieldColumns::from(field_columns); + + converter + .convert(table_name, tag_columns, field_columns, it) + .await + .expect("Conversion happened without error") + .try_collect() + .await + .expect("Conversion happened without error") + } + + /// Test helper: parses the csv content into a single record batch arrow + /// arrays columnar ArrayRef according to the schema + fn parse_to_record_batch(schema: SchemaRef, data: &str) -> 
RecordBatch { + if data.is_empty() { + return RecordBatch::new_empty(schema); + } + + let batch_size = 1000; + let mut reader = csv::ReaderBuilder::new(schema) + .with_batch_size(batch_size) + .build_buffered(data.as_bytes()) + .unwrap(); + + let first_batch = reader.next().expect("Reading first batch"); + assert!( + first_batch.is_ok(), + "Can not parse record batch from csv: {first_batch:?}" + ); + assert!( + reader.next().is_none(), + "Unexpected batch while parsing csv" + ); + + println!("batch: \n{first_batch:#?}"); + + first_batch.unwrap() + } + + /// Parses a set of CSV lines into several `RecordBatchStream`s of varying sizes + /// + /// For example, with three input lines: + /// line1 + /// line2 + /// line3 + /// + /// This will produce two output streams: + /// Stream1: (line1), (line2), (line3) + /// Stream2: (line1, line2), (line3) + fn parse_to_iterators(schema: SchemaRef, lines: &[&str]) -> Vec { + split_lines(lines) + .into_iter() + .map(|batches| { + let batches = batches + .into_iter() + .map(|chunk| parse_to_record_batch(Arc::clone(&schema), &chunk)) + .collect::>(); + + stream_from_batches(Arc::clone(&schema), batches) + }) + .collect() + } + + fn split_lines(lines: &[&str]) -> Vec> { + println!("** Input data:\n{lines:#?}\n\n"); + if lines.is_empty() { + return vec![vec![], vec![String::from("")]]; + } + + // potential split points for batches + // we keep each split point twice so we may also produce empty batches + let n_lines = lines.len(); + let mut split_points = (0..=n_lines).chain(0..=n_lines).collect::>(); + split_points.sort(); + + let mut split_point_sets = split_points + .into_iter() + .powerset() + .map(|mut split_points| { + split_points.sort(); + + // ensure that "begin" and "end" are always split points + if split_points.first() != Some(&0) { + split_points.insert(0, 0); + } + if split_points.last() != Some(&n_lines) { + split_points.push(n_lines); + } + + split_points + }) + .collect::>(); + split_point_sets.sort(); + + let 
variants = split_point_sets + .into_iter() + .unique() + .map(|split_points| { + let batches = split_points + .into_iter() + .tuple_windows() + .map(|(begin, end)| lines[begin..end].join("\n")) + .collect::>(); + + // stream from those batches + assert!(!batches.is_empty()); + batches + }) + .collect::>(); + + assert!(!variants.is_empty()); + variants + } + + #[test] + fn test_split_lines() { + assert_eq!(split_lines(&[]), vec![vec![], vec![String::from("")],],); + + assert_eq!( + split_lines(&["foo"]), + vec![ + vec![String::from(""), String::from("foo")], + vec![String::from(""), String::from("foo"), String::from("")], + vec![String::from("foo")], + vec![String::from("foo"), String::from("")], + ], + ); + + assert_eq!( + split_lines(&["foo", "bar"]), + vec![ + vec![ + String::from(""), + String::from("foo"), + String::from(""), + String::from("bar") + ], + vec![ + String::from(""), + String::from("foo"), + String::from(""), + String::from("bar"), + String::from("") + ], + vec![String::from(""), String::from("foo"), String::from("bar")], + vec![ + String::from(""), + String::from("foo"), + String::from("bar"), + String::from("") + ], + vec![String::from(""), String::from("foo\nbar")], + vec![String::from(""), String::from("foo\nbar"), String::from("")], + vec![String::from("foo"), String::from(""), String::from("bar")], + vec![ + String::from("foo"), + String::from(""), + String::from("bar"), + String::from("") + ], + vec![String::from("foo"), String::from("bar")], + vec![String::from("foo"), String::from("bar"), String::from("")], + vec![String::from("foo\nbar")], + vec![String::from("foo\nbar"), String::from("")], + ], + ); + + assert_eq!( + split_lines(&["foo", "bar", "xxx"]), + vec![ + vec![ + String::from(""), + String::from("foo"), + String::from(""), + String::from("bar"), + String::from(""), + String::from("xxx") + ], + vec![ + String::from(""), + String::from("foo"), + String::from(""), + String::from("bar"), + String::from(""), + String::from("xxx"), + 
String::from("") + ], + vec![ + String::from(""), + String::from("foo"), + String::from(""), + String::from("bar"), + String::from("xxx") + ], + vec![ + String::from(""), + String::from("foo"), + String::from(""), + String::from("bar"), + String::from("xxx"), + String::from("") + ], + vec![ + String::from(""), + String::from("foo"), + String::from(""), + String::from("bar\nxxx") + ], + vec![ + String::from(""), + String::from("foo"), + String::from(""), + String::from("bar\nxxx"), + String::from("") + ], + vec![ + String::from(""), + String::from("foo"), + String::from("bar"), + String::from(""), + String::from("xxx") + ], + vec![ + String::from(""), + String::from("foo"), + String::from("bar"), + String::from(""), + String::from("xxx"), + String::from("") + ], + vec![ + String::from(""), + String::from("foo"), + String::from("bar"), + String::from("xxx") + ], + vec![ + String::from(""), + String::from("foo"), + String::from("bar"), + String::from("xxx"), + String::from("") + ], + vec![ + String::from(""), + String::from("foo"), + String::from("bar\nxxx") + ], + vec![ + String::from(""), + String::from("foo"), + String::from("bar\nxxx"), + String::from("") + ], + vec![ + String::from(""), + String::from("foo\nbar"), + String::from(""), + String::from("xxx") + ], + vec![ + String::from(""), + String::from("foo\nbar"), + String::from(""), + String::from("xxx"), + String::from("") + ], + vec![ + String::from(""), + String::from("foo\nbar"), + String::from("xxx") + ], + vec![ + String::from(""), + String::from("foo\nbar"), + String::from("xxx"), + String::from("") + ], + vec![String::from(""), String::from("foo\nbar\nxxx")], + vec![ + String::from(""), + String::from("foo\nbar\nxxx"), + String::from("") + ], + vec![ + String::from("foo"), + String::from(""), + String::from("bar"), + String::from(""), + String::from("xxx") + ], + vec![ + String::from("foo"), + String::from(""), + String::from("bar"), + String::from(""), + String::from("xxx"), + String::from("") + ], + 
vec![ + String::from("foo"), + String::from(""), + String::from("bar"), + String::from("xxx") + ], + vec![ + String::from("foo"), + String::from(""), + String::from("bar"), + String::from("xxx"), + String::from("") + ], + vec![ + String::from("foo"), + String::from(""), + String::from("bar\nxxx") + ], + vec![ + String::from("foo"), + String::from(""), + String::from("bar\nxxx"), + String::from("") + ], + vec![ + String::from("foo"), + String::from("bar"), + String::from(""), + String::from("xxx") + ], + vec![ + String::from("foo"), + String::from("bar"), + String::from(""), + String::from("xxx"), + String::from("") + ], + vec![ + String::from("foo"), + String::from("bar"), + String::from("xxx") + ], + vec![ + String::from("foo"), + String::from("bar"), + String::from("xxx"), + String::from("") + ], + vec![String::from("foo"), String::from("bar\nxxx")], + vec![ + String::from("foo"), + String::from("bar\nxxx"), + String::from("") + ], + vec![ + String::from("foo\nbar"), + String::from(""), + String::from("xxx") + ], + vec![ + String::from("foo\nbar"), + String::from(""), + String::from("xxx"), + String::from("") + ], + vec![String::from("foo\nbar"), String::from("xxx")], + vec![ + String::from("foo\nbar"), + String::from("xxx"), + String::from("") + ], + vec![String::from("foo\nbar\nxxx")], + vec![String::from("foo\nbar\nxxx"), String::from("")] + ] + ); + } + + #[tokio::test] + async fn test_group_generator_mem_limit() { + let memory_pool = Arc::new(GreedyMemoryPool::new(1)) as _; + + let ggen = GroupGenerator::new(vec![Arc::from("g")], memory_pool); + let input = futures::stream::iter([Ok(Series { + tags: vec![Tag { + key: Arc::from("g"), + value: Arc::from("x"), + }], + data: Data::FloatPoints(vec![Batch { + timestamps: vec![], + values: vec![], + }]), + })]); + let err = match ggen.group(input).await { + Ok(stream) => stream.try_collect::>().await.unwrap_err(), + Err(e) => e, + }; + assert_matches!(err, DataFusionError::ResourcesExhausted(_)); + } + + 
#[tokio::test] + async fn test_group_generator_no_mem_limit() { + let memory_pool = Arc::new(GreedyMemoryPool::new(usize::MAX)) as _; + // use a generator w/ a low buffered allocation to force multiple `alloc` calls + let ggen = GroupGenerator::new_with_buffered_size_max(vec![Arc::from("g")], memory_pool, 1); + let input = futures::stream::iter([ + Ok(Series { + tags: vec![Tag { + key: Arc::from("g"), + value: Arc::from("x"), + }], + data: Data::IntegerPoints(vec![Batch { + timestamps: vec![1], + values: vec![1], + }]), + }), + Ok(Series { + tags: vec![Tag { + key: Arc::from("g"), + value: Arc::from("y"), + }], + data: Data::IntegerPoints(vec![Batch { + timestamps: vec![2], + values: vec![2], + }]), + }), + Ok(Series { + tags: vec![Tag { + key: Arc::from("g"), + value: Arc::from("x"), + }], + data: Data::IntegerPoints(vec![Batch { + timestamps: vec![3], + values: vec![3], + }]), + }), + Ok(Series { + tags: vec![Tag { + key: Arc::from("g"), + value: Arc::from("x"), + }], + data: Data::IntegerPoints(vec![Batch { + timestamps: vec![4], + values: vec![4], + }]), + }), + ]); + let actual = ggen + .group(input) + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let expected = vec![ + Either::Group(Group { + tag_keys: vec![Arc::from("g")], + partition_key_vals: vec![Arc::from("x")], + }), + Either::Series(Series { + tags: vec![Tag { + key: Arc::from("g"), + value: Arc::from("x"), + }], + data: Data::IntegerPoints(vec![Batch { + timestamps: vec![1], + values: vec![1], + }]), + }), + Either::Series(Series { + tags: vec![Tag { + key: Arc::from("g"), + value: Arc::from("x"), + }], + data: Data::IntegerPoints(vec![Batch { + timestamps: vec![3], + values: vec![3], + }]), + }), + Either::Series(Series { + tags: vec![Tag { + key: Arc::from("g"), + value: Arc::from("x"), + }], + data: Data::IntegerPoints(vec![Batch { + timestamps: vec![4], + values: vec![4], + }]), + }), + Either::Group(Group { + tag_keys: vec![Arc::from("g")], + partition_key_vals: 
vec![Arc::from("y")], + }), + Either::Series(Series { + tags: vec![Tag { + key: Arc::from("g"), + value: Arc::from("y"), + }], + data: Data::IntegerPoints(vec![Batch { + timestamps: vec![2], + values: vec![2], + }]), + }), + ]; + assert_eq!(actual, expected); + } + + fn assert_series_set( + set: &SeriesSet, + table_name: &'static str, + tags: [(&'static str, &'static str); N], + field_indexes: FieldIndexes, + data: [&'static str; M], + ) { + assert_eq!(set.table_name.as_ref(), table_name); + + let set_tags = set + .tags + .iter() + .map(|(a, b)| (a.as_ref(), b.as_ref())) + .collect::>(); + assert_eq!(set_tags.as_slice(), tags); + + assert_eq!(set.field_indexes, field_indexes); + + assert_batches_eq!(data, &[set.batch.slice(set.start_row, set.num_rows)]); + } +} diff --git a/iox_query/src/exec/seriesset/series.rs b/iox_query/src/exec/seriesset/series.rs new file mode 100644 index 0000000..4f12c5d --- /dev/null +++ b/iox_query/src/exec/seriesset/series.rs @@ -0,0 +1,775 @@ +//! This module contains the native Rust version of the Data frames +//! that are sent back in the storage gRPC format. 
+ +use std::{fmt, sync::Arc}; + +use arrow::{ + array::{ + Array, ArrayRef, BooleanArray, Float64Array, Int64Array, StringArray, + TimestampNanosecondArray, UInt64Array, + }, + compute, + datatypes::DataType as ArrowDataType, +}; +use predicate::rpc_predicate::{FIELD_COLUMN_NAME, MEASUREMENT_COLUMN_NAME}; + +use crate::exec::{field::FieldIndex, seriesset::SeriesSet}; +use snafu::Snafu; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Unsupported data type while translating to Frames: {}", data_type))] + UnsupportedDataType { data_type: ArrowDataType }, + + #[snafu(display("Unsupported field data while translating to Frames: {}", data_type))] + UnsupportedFieldType { data_type: ArrowDataType }, +} + +pub type Result = std::result::Result; + +/// A name=value pair used to represent a series's tag +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Tag { + pub key: Arc, + pub value: Arc, +} + +impl Tag { + /// Memory usage in bytes, including `self`. + pub fn size(&self) -> usize { + std::mem::size_of_val(self) + self.key.len() + self.value.len() + } +} + +impl fmt::Display for Tag { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}={}", self.key, self.value) + } +} + +/// Represents a single logical TimeSeries +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Series { + /// key = value pairs that define this series + /// (including the _measurement and _field that correspond to table name and column name) + pub tags: Vec, + + /// The raw data for this series + pub data: Data, +} + +impl Series { + pub fn num_batches(&self) -> usize { + match &self.data { + Data::FloatPoints(batches) => batches.len(), + Data::IntegerPoints(batches) => batches.len(), + Data::UnsignedPoints(batches) => batches.len(), + Data::BooleanPoints(batches) => batches.len(), + Data::StringPoints(batches) => batches.len(), + } + } + + /// Memory usage in bytes, including `self`. 
+ pub fn size(&self) -> usize { + std::mem::size_of_val(self) + + (std::mem::size_of::() * self.tags.capacity()) + + self + .tags + .iter() + .map(|tag| tag.size() - std::mem::size_of_val(tag)) + .sum::() + + self.data.size() + - std::mem::size_of_val(&self.data) + } +} + +impl fmt::Display for Series { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Series tags={{")?; + let mut first = true; + self.tags.iter().try_for_each(|tag| { + if !first { + write!(f, ", ")?; + } else { + first = false; + } + write!(f, "{tag}") + })?; + writeln!(f, "}}")?; + write!(f, " {}", self.data) + } +} + +/// Typed data for a particular timeseries +#[derive(Clone, Debug)] +pub enum Data { + FloatPoints(Vec>), + IntegerPoints(Vec>), + UnsignedPoints(Vec>), + BooleanPoints(Vec>), + StringPoints(Vec>), +} + +impl Data { + /// Memory usage in bytes, including `self`. + pub fn size(&self) -> usize { + let data_sz: usize = match self { + Self::FloatPoints(points_vec) => points_vec.iter().map(|ps| ps.size()).sum(), + Self::IntegerPoints(points_vec) => points_vec.iter().map(|ps| ps.size()).sum(), + Self::UnsignedPoints(points_vec) => points_vec.iter().map(|ps| ps.size()).sum(), + Self::BooleanPoints(points_vec) => points_vec.iter().map(|ps| ps.size()).sum(), + Self::StringPoints(points_vec) => points_vec.iter().map(|ps| ps.size()).sum(), + }; + std::mem::size_of_val(self) + data_sz + } +} + +impl PartialEq for Data { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::FloatPoints(l_batches), Self::FloatPoints(r_batches)) => l_batches == r_batches, + (Self::IntegerPoints(l_batches), Self::IntegerPoints(r_batches)) => { + l_batches == r_batches + } + (Self::UnsignedPoints(l_batches), Self::UnsignedPoints(r_batches)) => { + l_batches == r_batches + } + (Self::BooleanPoints(l_batches), Self::BooleanPoints(r_batches)) => { + l_batches == r_batches + } + (Self::StringPoints(l_batches), Self::StringPoints(r_batches)) => { + l_batches == r_batches + } + _ 
=> false, + } + } +} + +impl Eq for Data {} + +/// Returns size of given vector of primitive types in bytes, EXCLUDING `vec` itself. +fn primitive_vec_size(vec: &Vec) -> usize { + std::mem::size_of::() * vec.capacity() +} + +impl fmt::Display for Data { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::FloatPoints(batches) => write!(f, "FloatPoints batches: {batches:?}"), + Self::IntegerPoints(batches) => write!(f, "IntegerPoints batches: {batches:?}"), + Self::UnsignedPoints(batches) => write!(f, "UnsignedPoints batches: {batches:?}"), + Self::BooleanPoints(batches) => write!(f, "BooleanPoints batches: {batches:?}"), + Self::StringPoints(batches) => write!(f, "StringPoints batches: {batches:?}"), + } + } +} + +#[derive(Clone, Debug, PartialEq)] +pub struct Batch { + pub timestamps: Vec, + pub values: Vec, +} + +impl Batch { + fn size(&self) -> usize { + std::mem::size_of_val(self) + + primitive_vec_size(&self.timestamps) + + primitive_vec_size(&self.values) + } +} + +impl SeriesSet { + /// Returns true if the array is entirely null between start_row and + /// start_row+num_rows + fn is_all_null(arr: &ArrayRef) -> bool { + arr.null_count() == arr.len() + } + + pub fn is_timestamp_all_null(&self) -> bool { + self.field_indexes.iter().all(|field_index| { + let array = self.batch.column(field_index.timestamp_index); + Self::is_all_null(array) + }) + } + + pub fn try_into_series(self, batch_size: usize) -> Result> { + self.field_indexes + .iter() + .filter_map(|index| self.field_to_series(index, batch_size).transpose()) + .collect() + } + + // Convert and append the values from a single field to a Series + // appended to `frames` + fn field_to_series(&self, index: &FieldIndex, batch_size: usize) -> Result> { + let batch = self.batch.slice(self.start_row, self.num_rows); + let schema = batch.schema(); + + let field = schema.field(index.value_index); + let array = batch.column(index.value_index); + + // No values for this field are in 
the array so it does not + // contribute to a series. + if field.is_nullable() && Self::is_all_null(array) { + return Ok(None); + } + + let tags = self.create_frame_tags(schema.field(index.value_index).name()); + + let mut timestamps = compute::kernels::nullif::nullif( + batch.column(index.timestamp_index), + &compute::is_null(array).expect("is_null"), + ) + .expect("null handling") + .as_any() + .downcast_ref::() + .unwrap() + .extract_batched_values(batch_size); + timestamps.shrink_to_fit(); + + let data = match array.data_type() { + ArrowDataType::Utf8 => { + let values = array + .as_any() + .downcast_ref::() + .unwrap() + .extract_batched_values(batch_size); + Data::StringPoints(build_batches(timestamps, values)) + } + ArrowDataType::Float64 => { + let values = array + .as_any() + .downcast_ref::() + .unwrap() + .extract_batched_values(batch_size); + Data::FloatPoints(build_batches(timestamps, values)) + } + ArrowDataType::Int64 => { + let values = array + .as_any() + .downcast_ref::() + .unwrap() + .extract_batched_values(batch_size); + Data::IntegerPoints(build_batches(timestamps, values)) + } + ArrowDataType::UInt64 => { + let values = array + .as_any() + .downcast_ref::() + .unwrap() + .extract_batched_values(batch_size); + Data::UnsignedPoints(build_batches(timestamps, values)) + } + ArrowDataType::Boolean => { + let values = array + .as_any() + .downcast_ref::() + .unwrap() + .extract_batched_values(batch_size); + Data::BooleanPoints(build_batches(timestamps, values)) + } + _ => { + return UnsupportedDataTypeSnafu { + data_type: array.data_type().clone(), + } + .fail(); + } + }; + + Ok(Some(Series { tags, data })) + } + + /// Create the tag=value pairs for this series set, adding + /// adding the _f and _m tags for the field name and measurement + fn create_frame_tags(&self, field_name: &str) -> Vec { + // Add special _field and _measurement tags and return them in + // lexicographical (sorted) order + + let mut all_tags = self + .tags + .iter() + 
.cloned() + .chain([ + (Arc::from(FIELD_COLUMN_NAME), Arc::from(field_name)), + ( + Arc::from(MEASUREMENT_COLUMN_NAME), + Arc::clone(&self.table_name), + ), + ]) + .collect::>(); + + // sort by name + all_tags.sort_by(|(key1, _value), (key2, _value2)| key1.cmp(key2)); + + all_tags + .into_iter() + .map(|(key, value)| Tag { key, value }) + .collect() + } +} + +/// Zip together nested vectors of timestamps and values to create batches of points +fn build_batches(timestamps: Vec>, values: Vec>) -> Vec> { + timestamps + .into_iter() + .zip(values) + .map(|(timestamps, values)| Batch { timestamps, values }) + .collect() +} + +/// Represents a group of `Series` +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct Group { + /// Contains *ALL* tag keys (not just those used for grouping) + pub tag_keys: Vec>, + + /// Contains the values that define the group (may be values from + /// fields other than tags). + /// + /// the values of the group tags that defined the group. + /// For example, + /// + /// If there were tags `t0`, `t1`, and `t2`, and the query had + /// group_keys of `[t1, t2]` then this list would have the values + /// of the t1 and t2 columns + pub partition_key_vals: Vec>, +} + +impl fmt::Display for Group { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Group tag_keys: ")?; + fmt_strings(f, &self.tag_keys)?; + write!(f, " partition_key_vals: ")?; + fmt_strings(f, &self.partition_key_vals)?; + Ok(()) + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Either { + Series(Series), + Group(Group), +} + +impl From for Either { + fn from(value: Series) -> Self { + Self::Series(value) + } +} + +impl From for Either { + fn from(value: Group) -> Self { + Self::Group(value) + } +} + +impl fmt::Display for Either { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Series(series) => series.fmt(f), + Self::Group(group) => group.fmt(f), + } + } +} + +fn fmt_strings(f: &mut fmt::Formatter<'_>, 
strings: &[Arc]) -> fmt::Result { + let mut first = true; + strings.iter().try_for_each(|item| { + if !first { + write!(f, ", ")?; + } else { + first = false; + } + write!(f, "{item}") + }) +} + +trait ExtractBatchedValues { + /// Extracts rows as a vector, + /// for all rows `i` where `valid[i]` is set + fn extract_batched_values(&self, batch_size: usize) -> Vec>; +} + +/// Implements extract_batched_values for Arrow arrays. +macro_rules! extract_batched_values_impl { + ($DATA_TYPE:ty) => { + extract_batched_values_impl! { $DATA_TYPE, identity } + }; + ($DATA_TYPE:ty, $ITER_ADAPTER:expr) => { + fn extract_batched_values(&self, batch_size: usize) -> Vec> { + let num_batches = 1 + self.len() / batch_size; + let mut batches = Vec::with_capacity(num_batches); + + let mut v = Vec::with_capacity(batch_size); + for e in $ITER_ADAPTER(self.iter().flatten()) { + if v.len() >= batch_size { + batches.push(v); + v = Vec::with_capacity(batch_size); + } + v.push(e); + } + if !v.is_empty() { + v.shrink_to_fit(); + batches.push(v); + } + batches.shrink_to_fit(); + batches + } + }; +} + +fn identity(t: T) -> T { + t +} + +fn to_owned_string<'a, I>(i: I) -> impl Iterator +where + I: Iterator, +{ + i.map(str::to_string) +} + +impl ExtractBatchedValues for StringArray { + extract_batched_values_impl! { String, to_owned_string } +} + +impl ExtractBatchedValues for Int64Array { + extract_batched_values_impl! {i64} +} + +impl ExtractBatchedValues for UInt64Array { + extract_batched_values_impl! {u64} +} + +impl ExtractBatchedValues for Float64Array { + extract_batched_values_impl! {f64} +} + +impl ExtractBatchedValues for BooleanArray { + extract_batched_values_impl! {bool} +} + +impl ExtractBatchedValues for TimestampNanosecondArray { + extract_batched_values_impl! 
{i64} +} + +#[cfg(test)] +mod tests { + use crate::exec::field::FieldIndexes; + use arrow::{compute::concat_batches, record_batch::RecordBatch}; + + use super::*; + + fn series_set_to_series_strings(series_set: SeriesSet, batch_size: usize) -> Vec { + let series: Vec = series_set.try_into_series(batch_size).unwrap(); + + let series: Vec = series.into_iter().map(|s| s.to_string()).collect(); + + series + .iter() + .flat_map(|s| s.split('\n')) + .map(|s| s.to_string()) + .collect() + } + + #[test] + fn test_series_set_conversion() { + let series_set = SeriesSet { + table_name: Arc::from("the_table"), + tags: vec![(Arc::from("tag1"), Arc::from("val1"))], + field_indexes: FieldIndexes::from_timestamp_and_value_indexes(5, &[0, 1, 2, 3, 4]), + start_row: 1, + num_rows: 4, + batch: make_record_batch(), + }; + + let series_strings = series_set_to_series_strings(series_set, 3); + + let expected = vec![ + "Series tags={_field=string_field, _measurement=the_table, tag1=val1}", + " StringPoints batches: [Batch { timestamps: [2000, 3000, 4000], values: [\"bar\", \"baz\", \"bar\"] }, Batch { timestamps: [5000], values: [\"baz\"] }]", + "Series tags={_field=int_field, _measurement=the_table, tag1=val1}", + " IntegerPoints batches: [Batch { timestamps: [2000, 3000, 4000], values: [2, 3, 4] }, Batch { timestamps: [5000], values: [5] }]", + "Series tags={_field=uint_field, _measurement=the_table, tag1=val1}", + " UnsignedPoints batches: [Batch { timestamps: [2000, 3000, 4000], values: [22, 33, 44] }, Batch { timestamps: [5000], values: [55] }]", + "Series tags={_field=float_field, _measurement=the_table, tag1=val1}", + " FloatPoints batches: [Batch { timestamps: [2000, 3000, 4000], values: [20.1, 30.1, 40.1] }, Batch { timestamps: [5000], values: [50.1] }]", + "Series tags={_field=boolean_field, _measurement=the_table, tag1=val1}", + " BooleanPoints batches: [Batch { timestamps: [2000, 3000, 4000], values: [false, true, false] }, Batch { timestamps: [5000], values: [true] }]", + ]; 
+ + assert_eq!( + series_strings, expected, + "Expected:\n{expected:#?}\nActual:\n{series_strings:#?}" + ); + } + + #[test] + fn test_series_set_conversion_mixed_case_tags() { + let time1_array: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![1, 2, 3])); + let string1_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz"])); + + let batch = RecordBatch::try_from_iter(vec![ + ("time1", time1_array as ArrayRef), + ("string_field1", string1_array), + ]) + .expect("created new record batch"); + + let series_set = SeriesSet { + table_name: Arc::from("the_table"), + tags: vec![ + (Arc::from("CAPITAL_TAG"), Arc::from("the_value")), + (Arc::from("tag1"), Arc::from("val1")), + ], + // field indexes are (value, time) + field_indexes: FieldIndexes::from_slice(&[(1, 0)]), + start_row: 1, + num_rows: 2, + batch, + }; + + let series_strings = series_set_to_series_strings(series_set, 100); + + // expect CAPITAL_TAG is before `_field` and `_measurement` tags + // (as that is the correct lexicographical ordering) + let expected = vec![ + "Series tags={CAPITAL_TAG=the_value, _field=string_field1, _measurement=the_table, tag1=val1}", + " StringPoints batches: [Batch { timestamps: [2, 3], values: [\"bar\", \"baz\"] }]", + ]; + + assert_eq!( + series_strings, expected, + "Expected:\n{expected:#?}\nActual:\n{series_strings:#?}" + ); + } + + #[test] + fn test_series_set_conversion_different_time_columns() { + let time1_array: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![1, 2, 3])); + let string1_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz"])); + let time2_array: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![3, 4, 5])); + let string2_array: ArrayRef = Arc::new(StringArray::from(vec!["boo", "far", "faz"])); + + let batch = RecordBatch::try_from_iter(vec![ + ("time1", time1_array as ArrayRef), + ("string_field1", string1_array), + ("time2", time2_array), + ("string_field2", string2_array), + ]) + .expect("created new 
record batch"); + + let series_set = SeriesSet { + table_name: Arc::from("the_table"), + tags: vec![(Arc::from("tag1"), Arc::from("val1"))], + // field indexes are (value, time) + field_indexes: FieldIndexes::from_slice(&[(3, 2), (1, 0)]), + start_row: 1, + num_rows: 2, + batch, + }; + + let series_strings = series_set_to_series_strings(series_set, 100); + + let expected = vec![ + "Series tags={_field=string_field2, _measurement=the_table, tag1=val1}", + " StringPoints batches: [Batch { timestamps: [4, 5], values: [\"far\", \"faz\"] }]", + "Series tags={_field=string_field1, _measurement=the_table, tag1=val1}", + " StringPoints batches: [Batch { timestamps: [2, 3], values: [\"bar\", \"baz\"] }]", + ]; + + assert_eq!( + series_strings, expected, + "Expected:\n{expected:#?}\nActual:\n{series_strings:#?}" + ); + } + + #[test] + fn test_series_set_conversion_with_entirely_null_field() { + // single series + let tag_array: ArrayRef = Arc::new(StringArray::from(vec!["MA", "MA", "MA", "MA"])); + let int_array: ArrayRef = Arc::new(Int64Array::from(vec![None, None, None, None])); + let float_array: ArrayRef = Arc::new(Float64Array::from(vec![ + Some(10.1), + Some(20.1), + None, + Some(40.1), + ])); + + let timestamp_array: ArrayRef = + Arc::new(TimestampNanosecondArray::from(vec![1000, 2000, 3000, 4000])); + + let batch = RecordBatch::try_from_iter_with_nullable(vec![ + ("state", tag_array, true), + ("int_field", int_array, true), + ("float_field", float_array, true), + ("time", timestamp_array, false), + ]) + .expect("created new record batch"); + + let series_set = SeriesSet { + table_name: Arc::from("the_table"), + tags: vec![(Arc::from("state"), Arc::from("MA"))], + field_indexes: FieldIndexes::from_timestamp_and_value_indexes(3, &[1, 2]), + start_row: 0, + num_rows: batch.num_rows(), + batch: batch.clone(), + }; + + // Expect only a single series (for the data in float_field, int_field is all + // nulls) + let series_strings = series_set_to_series_strings(series_set, 
100); + + let expected = vec![ + "Series tags={_field=float_field, _measurement=the_table, state=MA}", + " FloatPoints batches: [Batch { timestamps: [1000, 2000, 4000], values: [10.1, 20.1, 40.1] }]", + ]; + + assert_eq!( + series_strings, expected, + "Expected:\n{expected:#?}\nActual:\n{series_strings:#?}" + ); + + // Multi-batch case + // We can just append record batches here because the tag field does not change + let batch = repeat_batch(3, &batch); + let series_set = SeriesSet { + table_name: Arc::from("the_table"), + tags: vec![(Arc::from("state"), Arc::from("MA"))], + field_indexes: FieldIndexes::from_timestamp_and_value_indexes(3, &[1, 2]), + start_row: 0, + num_rows: batch.num_rows(), + batch, + }; + + let series_strings = series_set_to_series_strings(series_set, 4); + let expected = vec![ + "Series tags={_field=float_field, _measurement=the_table, state=MA}", + " FloatPoints batches: [Batch { timestamps: [1000, 2000, 4000, 1000], values: [10.1, 20.1, 40.1, 10.1] }, Batch { timestamps: [2000, 4000, 1000, 2000], values: [20.1, 40.1, 10.1, 20.1] }, Batch { timestamps: [4000], values: [40.1] }]", + ]; + + assert_eq!( + series_strings, expected, + "Expected:\n{expected:#?}\nActual:\n{series_strings:#?}" + ); + } + + #[test] + fn test_series_set_conversion_with_some_null_fields() { + // single series + let tag_array = StringArray::from(vec!["MA", "MA"]); + let string_array = StringArray::from(vec![None, Some("foo")]); + let float_array = Float64Array::from(vec![None, Some(1.0)]); + let int_array = Int64Array::from(vec![None, Some(-10)]); + let uint_array = UInt64Array::from(vec![None, Some(100)]); + let bool_array = BooleanArray::from(vec![None, Some(true)]); + + let timestamp_array = TimestampNanosecondArray::from(vec![1000, 2000]); + + let batch = RecordBatch::try_from_iter_with_nullable(vec![ + ("state", Arc::new(tag_array) as ArrayRef, true), + ("string_field", Arc::new(string_array), true), + ("float_field", Arc::new(float_array), true), + ("int_field", 
Arc::new(int_array), true), + ("uint_field", Arc::new(uint_array), true), + ("bool_field", Arc::new(bool_array), true), + ("time", Arc::new(timestamp_array), false), + ]) + .expect("created new record batch"); + + let series_set = SeriesSet { + table_name: Arc::from("the_table"), + tags: vec![(Arc::from("state"), Arc::from("MA"))], + field_indexes: FieldIndexes::from_timestamp_and_value_indexes(6, &[1, 2, 3, 4, 5]), + start_row: 0, + num_rows: batch.num_rows(), + batch: batch.clone(), + }; + + // Expect only a single series (for the data in float_field, int_field is all + // nulls) + let series_strings = series_set_to_series_strings(series_set, 100); + + let expected = vec![ + "Series tags={_field=string_field, _measurement=the_table, state=MA}", + " StringPoints batches: [Batch { timestamps: [2000], values: [\"foo\"] }]", + "Series tags={_field=float_field, _measurement=the_table, state=MA}", + " FloatPoints batches: [Batch { timestamps: [2000], values: [1.0] }]", + "Series tags={_field=int_field, _measurement=the_table, state=MA}", + " IntegerPoints batches: [Batch { timestamps: [2000], values: [-10] }]", + "Series tags={_field=uint_field, _measurement=the_table, state=MA}", + " UnsignedPoints batches: [Batch { timestamps: [2000], values: [100] }]", + "Series tags={_field=bool_field, _measurement=the_table, state=MA}", + " BooleanPoints batches: [Batch { timestamps: [2000], values: [true] }]", + ]; + + assert_eq!( + series_strings, expected, + "Expected:\n{expected:#?}\nActual:\n{series_strings:#?}" + ); + + // multi-batch case + + // the tag columns have just a single value so we can just repeat the original batch to + // generate more rows + let batch = repeat_batch(4, &batch); + let series_set = SeriesSet { + table_name: Arc::from("the_table"), + tags: vec![(Arc::from("state"), Arc::from("MA"))], + field_indexes: FieldIndexes::from_timestamp_and_value_indexes(6, &[1, 2, 3, 4, 5]), + start_row: 0, + num_rows: batch.num_rows(), + batch, + }; + + let 
series_strings = series_set_to_series_strings(series_set, 3); + + let expected = vec![ + "Series tags={_field=string_field, _measurement=the_table, state=MA}", + " StringPoints batches: [Batch { timestamps: [2000, 2000, 2000], values: [\"foo\", \"foo\", \"foo\"] }, Batch { timestamps: [2000], values: [\"foo\"] }]", + "Series tags={_field=float_field, _measurement=the_table, state=MA}", + " FloatPoints batches: [Batch { timestamps: [2000, 2000, 2000], values: [1.0, 1.0, 1.0] }, Batch { timestamps: [2000], values: [1.0] }]", + "Series tags={_field=int_field, _measurement=the_table, state=MA}", + " IntegerPoints batches: [Batch { timestamps: [2000, 2000, 2000], values: [-10, -10, -10] }, Batch { timestamps: [2000], values: [-10] }]", + "Series tags={_field=uint_field, _measurement=the_table, state=MA}", + " UnsignedPoints batches: [Batch { timestamps: [2000, 2000, 2000], values: [100, 100, 100] }, Batch { timestamps: [2000], values: [100] }]", + "Series tags={_field=bool_field, _measurement=the_table, state=MA}", + " BooleanPoints batches: [Batch { timestamps: [2000, 2000, 2000], values: [true, true, true] }, Batch { timestamps: [2000], values: [true] }]", + ]; + + assert_eq!( + series_strings, expected, + "Expected:\n{expected:#?}\nActual:\n{series_strings:#?}" + ); + } + + fn make_record_batch() -> RecordBatch { + let string_array: ArrayRef = Arc::new(StringArray::from(vec![ + "foo", "bar", "baz", "bar", "baz", "foo", + ])); + let int_array: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6])); + let uint_array: ArrayRef = Arc::new(UInt64Array::from(vec![11, 22, 33, 44, 55, 66])); + let float_array: ArrayRef = + Arc::new(Float64Array::from(vec![10.1, 20.1, 30.1, 40.1, 50.1, 60.1])); + let bool_array: ArrayRef = Arc::new(BooleanArray::from(vec![ + true, false, true, false, true, false, + ])); + + let timestamp_array: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![ + 1000, 2000, 3000, 4000, 5000, 6000, + ])); + + 
RecordBatch::try_from_iter_with_nullable(vec![ + ("string_field", string_array, true), + ("int_field", int_array, true), + ("uint_field", uint_array, true), + ("float_field", float_array, true), + ("boolean_field", bool_array, true), + ("time", timestamp_array, true), + ]) + .expect("created new record batch") + } + + fn repeat_batch(count: usize, rb: &RecordBatch) -> RecordBatch { + concat_batches(&rb.schema(), std::iter::repeat(rb).take(count)).unwrap() + } +} diff --git a/iox_query/src/exec/sleep.rs b/iox_query/src/exec/sleep.rs new file mode 100644 index 0000000..b7fa505 --- /dev/null +++ b/iox_query/src/exec/sleep.rs @@ -0,0 +1,265 @@ +/// Implementation of a "sleep" operation in DataFusion. +/// +/// The sleep operation passes through its input data and sleeps asynchronously for a duration determined by an +/// expression. The async sleep is implemented as a special [execution plan](SleepExpr) so we can perform this as part +/// of the async data stream. In contrast to a UDF, this will NOT block any threads. +use std::{sync::Arc, time::Duration}; + +use arrow::{ + array::{Array, Float32Array, Float64Array, Int64Array}, + datatypes::{DataType, SchemaRef, TimeUnit}, +}; +use datafusion::{ + common::DFSchemaRef, + error::DataFusionError, + execution::{context::SessionState, TaskContext}, + logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}, + physical_plan::{ + stream::RecordBatchStreamAdapter, DisplayAs, DisplayFormatType, ExecutionPlan, + PhysicalExpr, SendableRecordBatchStream, Statistics, + }, + physical_planner::PhysicalPlanner, + prelude::Expr, +}; +use futures::TryStreamExt; + +/// Logical plan note that represents a "sleep" operation. +/// +/// This will be lowered to [`SleepExpr`]. +/// +/// See [module](super) docs for more details. 
+#[derive(Clone, Debug, Hash, PartialEq, Eq)] +pub struct SleepNode { + input: LogicalPlan, + duration: Vec, +} + +impl SleepNode { + pub fn new(input: LogicalPlan, duration: Vec) -> Self { + Self { input, duration } + } + + pub fn plan( + &self, + planner: &dyn PhysicalPlanner, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + session_state: &SessionState, + ) -> Result { + let duration = self + .duration + .iter() + .map(|e| { + planner.create_physical_expr( + e, + logical_inputs[0].schema(), + &physical_inputs[0].schema(), + session_state, + ) + }) + .collect::, _>>()?; + Ok(SleepExpr::new(Arc::clone(&physical_inputs[0]), duration)) + } +} + +impl UserDefinedLogicalNodeCore for SleepNode { + fn name(&self) -> &str { + "Sleep" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + self.duration.clone() + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let duration = self + .duration + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + write!(f, "{}: duration=[{}]", self.name(), duration) + } + + fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + Self::new(inputs[0].clone(), exprs.to_vec()) + } +} + +/// Physical node that implements a "sleep" operation. +/// +/// This was lowered from [`SleepNode`]. +/// +/// See [module](super) docs for more details. +#[derive(Debug)] +pub struct SleepExpr { + /// Input data. + input: Arc, + + /// Expression that determines the sum of the sleep duration. 
+ duration: Vec>, +} + +impl SleepExpr { + pub fn new(input: Arc, duration: Vec>) -> Self { + Self { input, duration } + } +} + +impl DisplayAs for SleepExpr { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + let duration = self + .duration + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + write!(f, "Sleep: duration=[{}]", duration) + } + } + } +} + +impl ExecutionPlan for SleepExpr { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + self.input.output_partitioning() + } + + fn output_ordering(&self) -> Option<&[datafusion::physical_expr::PhysicalSortExpr]> { + self.input.output_ordering() + } + + fn children(&self) -> Vec> { + vec![Arc::clone(&self.input)] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion::error::Result> { + assert_eq!(children.len(), 1); + + Ok(Arc::new(Self::new( + Arc::clone(&children[0]), + self.duration.clone(), + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> datafusion::error::Result { + let stream = self.input.execute(partition, context)?; + + let duration = self.duration.clone(); + let stream = RecordBatchStreamAdapter::new( + stream.schema(), + stream.and_then(move |batch| { + let duration = duration.clone(); + + async move { + let mut sum = Duration::ZERO; + for expr in duration { + let array = expr.evaluate(&batch)?.into_array(batch.num_rows())?; + let d = array_to_duration(&array)?; + if let Some(d) = d { + sum += d; + } + } + if !sum.is_zero() { + tokio::time::sleep(sum).await; + } + Ok(batch) + } + }), + ); + Ok(Box::pin(stream)) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) + } +} + +fn array_to_duration(array: &dyn Array) -> Result, 
DataFusionError> { + match array.data_type() { + DataType::Null => Ok(None), + DataType::Duration(tunit) => { + let array = arrow::compute::cast(array, &DataType::Int64)?; + let array = array + .as_any() + .downcast_ref::() + .expect("just casted"); + let Some(sum) = arrow::compute::sum(array) else { + return Ok(None); + }; + if sum < 0 { + return Err(DataFusionError::Execution(format!( + "duration must be non-negative but is {sum}{tunit:?}" + ))); + } + let sum = sum as u64; + let duration = match tunit { + TimeUnit::Second => Duration::from_secs(sum), + TimeUnit::Millisecond => Duration::from_millis(sum), + TimeUnit::Microsecond => Duration::from_micros(sum), + TimeUnit::Nanosecond => Duration::from_nanos(sum), + }; + Ok(Some(duration)) + } + DataType::Float32 => { + let array = array + .as_any() + .downcast_ref::() + .expect("just checked"); + let Some(sum) = arrow::compute::sum(array) else { + return Ok(None); + }; + if sum < 0.0 || !sum.is_finite() { + return Err(DataFusionError::Execution(format!( + "duration must be non-negative but is {sum}s" + ))); + } + Ok(Some(Duration::from_secs_f32(sum))) + } + DataType::Float64 => { + let array = array + .as_any() + .downcast_ref::() + .expect("just checked"); + let Some(sum) = arrow::compute::sum(array) else { + return Ok(None); + }; + if sum < 0.0 || !sum.is_finite() { + return Err(DataFusionError::Execution(format!( + "duration must be non-negative but is {sum}s" + ))); + } + Ok(Some(Duration::from_secs_f64(sum))) + } + other => Err(DataFusionError::Internal(format!( + "Expected duration pattern to sleep(...), got: {other:?}" + ))), + } +} diff --git a/iox_query/src/exec/split.rs b/iox_query/src/exec/split.rs new file mode 100644 index 0000000..3010884 --- /dev/null +++ b/iox_query/src/exec/split.rs @@ -0,0 +1,931 @@ +//! This module contains a DataFusion extension node to "split" a +//! stream based on an expression. +//! +//! All rows for which the expression are true are sent to partition +//! 
`0` and all other rows are sent to partition `1`. +//! +//! There are corresponding [`LogicalPlan`] ([`StreamSplitNode`]) and +//! [`ExecutionPlan`] ([`StreamSplitExec`]) implementations, which are +//! typically used as shown in the following diagram: +//! +//! +//! ```text +//! partition 0 partition 1 +//! ▲ ▲ +//! │ │ +//! └────────────┬──────────┘ +//! │ +//! │ +//! ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ +//! StreamSplitExec │ +//! │ expr +//! ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ +//! ▲ +//! │ +//! ┌────────────────────────┐ +//! │ Union │ +//! │ │ +//! └────────────────────────┘ +//! ▲ +//! │ +//! +//! Other IOxScan code +//! ┌────────────────────────┐ (Filter, Dedup, etc) +//! │ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ ... +//! │ StreamSplit │ │ +//! │ └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ ▲ +//! │ Extension │ │ +//! └────────────────────────┘ │ +//! ▲ ┌────────────────────────┐ +//! │ │ TableProvider │ +//! ┌────────────────────────┐ │ │ +//! │ TableScan │ └────────────────────────┘ +//! │ │ +//! └────────────────────────┘ +//! +//! Execution Plan +//! Logical Plan (Physical Plan) +//! 
``` + +use std::{ + fmt::{self, Debug}, + sync::Arc, +}; + +use arrow::{ + array::{as_boolean_array, Array, ArrayRef, BooleanArray}, + compute::{self, filter_record_batch}, + datatypes::SchemaRef, + record_batch::RecordBatch, +}; +use datafusion::{ + common::DFSchemaRef, + error::{DataFusionError, Result}, + execution::context::TaskContext, + logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}, + physical_expr::PhysicalSortRequirement, + physical_plan::{ + expressions::PhysicalSortExpr, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RecordOutput}, + ColumnarValue, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, + PhysicalExpr, SendableRecordBatchStream, Statistics, + }, + scalar::ScalarValue, +}; + +use datafusion_util::{watch::WatchedTask, AdapterStream}; +use futures::StreamExt; +use observability_deps::tracing::*; +use parking_lot::Mutex; +use tokio::sync::mpsc::Sender; + +/// Implements stream splitting described in `make_stream_split` +/// +/// The resulting execution plan always produces exactly split_exprs's length + 1 partitions: +/// +/// * partition i (i < split_exprs.len()) are the rows for which the `split_expr[i]` +/// evaluates to true. If the rows are evaluated true for both `split_expr[i]` and +/// `split_expr[j]`, where i < j, the rows will be sent to partition i. However, +/// this will be mostly used in the use case of range expressions (e.g: [2 <= x, 2= x <= 5]) +/// in which rows are only evaluated to true in at most one of the expressions. +/// * partition n (n = partition split_exprs.len()) are the rows for which all split_exprs +/// do not evaluate to true (e.g. Null or false) +#[derive(Hash, PartialEq, Eq)] +pub struct StreamSplitNode { + input: LogicalPlan, + split_exprs: Vec, +} + +impl StreamSplitNode { + /// Create a new `StreamSplitNode` using `split_exprs` to divide the + /// rows. All `split_exprs` must evaluate to a boolean otherwise a + /// runtime error will occur. 
+ pub fn new(input: LogicalPlan, split_exprs: Vec) -> Self { + Self { input, split_exprs } + } + + pub fn split_exprs(&self) -> &Vec { + &self.split_exprs + } +} + +impl Debug for StreamSplitNode { + /// Use explain format for the Debug format. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.fmt_for_explain(f) + } +} + +impl UserDefinedLogicalNodeCore for StreamSplitNode { + fn name(&self) -> &str { + "StreamSplit" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + /// Schema is the same as the input schema + fn schema(&self) -> &DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + self.split_exprs.clone() + } + + fn fmt_for_explain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} split_expr={:?}", self.name(), self.split_exprs) + } + + fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + assert_eq!(inputs.len(), 1, "StreamSplitNode: input sizes inconsistent"); + Self { + input: inputs[0].clone(), + split_exprs: (*exprs).to_vec(), + } + } +} + +/// Tracks the state of the physical operator +enum State { + New, + Running { + streams: Vec>, + }, +} + +/// Physical operator that implements steam splitting operation +pub struct StreamSplitExec { + state: Mutex, + input: Arc, + split_exprs: Vec>, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, +} + +impl StreamSplitExec { + pub fn new(input: Arc, split_exprs: Vec>) -> Self { + let state = Mutex::new(State::New); + Self { + state, + input, + split_exprs, + metrics: ExecutionPlanMetricsSet::new(), + } + } +} + +impl Debug for StreamSplitExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "StreamSplitExec {:?}", self.split_exprs) + } +} + +impl ExecutionPlan for StreamSplitExec { + fn as_any(&self) -> &(dyn std::any::Any + 'static) { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + /// Always produces exactly two outputs + fn 
output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(self.split_exprs.len() + 1) + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + self.input.output_ordering() + } + + /// Always require a single input (eventually we might imagine + /// running this on multiple partitions concurrently to compute + /// the splits in parallel, but not now) + fn required_input_distribution(&self) -> Vec { + vec![Distribution::SinglePartition] + } + + fn required_input_ordering(&self) -> Vec>> { + // require that the output ordering of the child is preserved + // (so that this node logically splits what was desired) + let requirement = self + .input + .output_ordering() + .map(PhysicalSortRequirement::from_sort_exprs); + + vec![requirement] + } + + fn children(&self) -> Vec> { + vec![Arc::clone(&self.input)] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + match children.len() { + 1 => Ok(Arc::new(Self::new( + Arc::clone(&children[0]), + self.split_exprs.clone(), + ))), + _ => Err(DataFusionError::Internal( + "StreamSplitExec wrong number of children".to_string(), + )), + } + } + + /// Stream split has multiple partitions from 0 to n + /// Each partition i includes rows for which `split_exprs[i]` evaluate to true + /// + /// # Deadlock + /// + /// This will deadlock unless all partitions are consumed from + /// concurrently. Failing to consume from one partition blocks the other + /// partitions from progressing. 
+ fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + trace!(partition, "Start SplitExec::execute"); + self.start_if_needed(context)?; + + let mut state = self.state.lock(); + match &mut (*state) { + State::New => panic!("should have been initialized"), + State::Running { streams } => { + assert!(partition < streams.len()); + let stream = streams[partition].take().unwrap_or_else(|| { + panic!("Error executing stream #{partition} of StreamSplitExec"); + }); + trace!(partition, "End SplitExec::execute"); + Ok(stream) + } + } + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result { + // For now, don't return any statistics (in the future we + // could potentially estimate the output cardinalities) + Ok(Statistics::new_unknown(&self.schema())) + } +} + +impl DisplayAs for StreamSplitExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "StreamSplitExec") + } + } + } +} + +impl StreamSplitExec { + /// if in State::New, sets up the output running and sets self.state --> `Running` + fn start_if_needed(&self, context: Arc) -> Result<()> { + let mut state = self.state.lock(); + if matches!(*state, State::Running { .. 
}) { + return Ok(()); + } + + let num_input_streams = self.input.output_partitioning().partition_count(); + assert_eq!( + num_input_streams, 1, + "need exactly one input partition for stream split exec" + ); + + trace!("Setting up SplitStreamExec state"); + let input_stream = self.input.execute(0, context)?; + + let split_exprs = self.split_exprs.clone(); + + let num_streams = split_exprs.len() + 1; + let mut baseline_metrics = Vec::with_capacity(num_streams); + let mut txs = Vec::with_capacity(num_streams); + let mut rxs = Vec::with_capacity(num_streams); + for i in 0..num_streams { + baseline_metrics.push(BaselineMetrics::new(&self.metrics, i)); + let (tx, rx) = tokio::sync::mpsc::channel(2); + txs.push(tx); + rxs.push(rx); + } + + // launch the work on a different task, with a task to handle its output values + let fut = split_the_stream(input_stream, split_exprs, txs.clone(), baseline_metrics); + let handle = WatchedTask::new(fut, txs, "split"); + + let streams = rxs + .into_iter() + .map(|rx| { + Some(AdapterStream::adapt( + self.input.schema(), + rx, + Arc::clone(&handle), + )) + }) + .collect::>(); + + *state = State::Running { streams }; + + Ok(()) + } +} + +/// This function does the actual splitting: evaluates `split_exprs` on +/// each input [`RecordBatch`], and then sends the rows to the correct +/// output `tx[i]` +async fn split_the_stream( + mut input_stream: SendableRecordBatchStream, + split_exprs: Vec>, + tx: Vec>>, + baseline_metrics: Vec, +) -> std::result::Result<(), DataFusionError> { + assert_eq!(split_exprs.len() + 1, tx.len()); + assert_eq!(tx.len(), baseline_metrics.len()); + + let elapsed_computes = baseline_metrics + .iter() + .map(|b| b.elapsed_compute()) + .collect::>(); + + while let Some(batch) = input_stream.next().await { + let batch = batch?; + trace!(num_rows = batch.num_rows(), "Processing batch"); + + // All streams are not done yet + let mut tx_done = tx.iter().map(|_| false).collect::>(); + + // Get data from the current 
batch for each stream + let mut remaining_indices: Option = None; + for i in 0..split_exprs.len() { + let timer = elapsed_computes[i].timer(); + let expr = &split_exprs[i]; + + // Compute indices that meets this expr + let true_indices = expr.evaluate(&batch)?; + // Indices that does not meet this expr + let not_true_indices = negate(&true_indices)?; + + // Indices that do not meet all exprs + if let Some(not_true) = remaining_indices { + remaining_indices = Some( + and(¬_true, ¬_true_indices) + .expect("Error computing combining negating indices"), + ); + } else { + remaining_indices = Some(not_true_indices); + }; + + // data that meets expr + let true_batch = compute_batch(&batch, true_indices, false)?; + timer.done(); + + // record output counts + let true_batch = true_batch.record_output(&baseline_metrics[i]); + + // don't treat a hangup as an error, as it can also be caused + // by a LIMIT operation where the entire stream is not + // consumed) + if let Err(e) = tx[i].send(Ok(true_batch)).await { + debug!(%e, "Split tx[{}] hung up, ignoring", i); + tx_done[i] = true; + } + } + + // last stream of data gets values that did not get routed to other streams + let timer = elapsed_computes[elapsed_computes.len() - 1].timer(); + let remaining_indices = + remaining_indices.expect("The last set of indices of the split should have values"); + let final_not_true_batch = compute_batch(&batch, remaining_indices, true)?; + timer.done(); + + // record output counts + let final_not_true_batch = + final_not_true_batch.record_output(&baseline_metrics[elapsed_computes.len() - 1]); + + // don't treat a hangup as an error, as it can also be caused + // by a LIMIT operation where the entire stream is not + // consumed) + if let Err(e) = tx[elapsed_computes.len() - 1] + .send(Ok(final_not_true_batch)) + .await + { + debug!(%e, "Split tx[{}] hung up, ignoring", elapsed_computes.len()-1); + tx_done[elapsed_computes.len() - 1] = true; + } + + if tx_done.iter().all(|x| *x) { + 
debug!("All split tx ends have hung up, stopping loop"); + return Ok(()); + } + } + + trace!("Splitting done successfully"); + Ok(()) +} + +fn compute_batch( + input_batch: &RecordBatch, + indices: ColumnarValue, + last_batch: bool, +) -> Result { + let batch = match indices { + ColumnarValue::Array(indices) => { + let indices = indices.as_any().downcast_ref::().unwrap(); + + // include null for last batch + if last_batch && indices.null_count() > 0 { + // since !Null --> Null, but we want all the + // remaining rows, that are not in true_indicies, + // transform any nulls into true for this one + let mapped_indicies = indices.iter().map(|v| v.or(Some(true))).collect::>(); + + filter_record_batch(input_batch, &BooleanArray::from(mapped_indicies)) + } else { + filter_record_batch(input_batch, indices) + }? + } + ColumnarValue::Scalar(ScalarValue::Boolean(val)) => { + let empty_record_batch = RecordBatch::new_empty(input_batch.schema()); + match val { + Some(true) => input_batch.clone(), + Some(false) => empty_record_batch, + _ => panic!("mismatched boolean values: {val:?}"), + } + } + _ => { + panic!("mismatched array types"); + } + }; + + Ok(batch) +} + +/// compute the boolean compliment of the columnar value (which must be boolean) +fn negate(v: &ColumnarValue) -> Result { + match v { + ColumnarValue::Array(arr) => { + let arr = arr.as_any().downcast_ref::().ok_or_else(|| { + let msg = format!("Expected boolean array, but had type {:?}", arr.data_type()); + DataFusionError::Internal(msg) + })?; + let neg_array = Arc::new(compute::not(arr)?) 
as ArrayRef; + Ok(ColumnarValue::Array(neg_array)) + } + ColumnarValue::Scalar(val) => { + if let ScalarValue::Boolean(v) = val { + let not_v = v.map(|v| !v); + Ok(ColumnarValue::Scalar(ScalarValue::Boolean(not_v))) + } else { + let msg = format!( + "Expected boolean literal, but got type {:?}", + val.data_type() + ); + Err(DataFusionError::Internal(msg)) + } + } + } +} + +fn and(left: &ColumnarValue, right: &ColumnarValue) -> Result { + match (left, right) { + (ColumnarValue::Array(arr_left), ColumnarValue::Array(arr_right)) => { + let arr_left = as_boolean_array(arr_left); + let arr_right = as_boolean_array(arr_right); + let and_array = Arc::new(compute::and(arr_left, arr_right)?) as ArrayRef; + Ok(ColumnarValue::Array(and_array)) + } + (ColumnarValue::Scalar(val_left), ColumnarValue::Scalar(val_right)) => { + if let (ScalarValue::Boolean(Some(v_left)), ScalarValue::Boolean(Some(v_right))) = + (val_left, val_right) + { + let and_val = v_left & v_right; + Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(and_val)))) + } else { + let msg = format!( + "Expected two boolean literals, but got type {:?} and type {:?}", + val_left.data_type(), + val_right.data_type() + ); + Err(DataFusionError::Internal(msg)) + } + } + _ => { + panic!("Expected either two boolean arrays or two boolean scalars, but had type {:?} and type {:?}", left.data_type(), right.data_type()); + } + } +} + +#[cfg(test)] +mod tests { + use arrow::array::{Int64Array, StringArray}; + use arrow_util::assert_batches_sorted_eq; + use datafusion::{ + physical_plan::memory::MemoryExec, + prelude::{col, lit}, + }; + use datafusion_util::test_collect_partition; + + use crate::util::df_physical_expr; + + use super::*; + + #[tokio::test] + async fn test_basic_split() { + test_helpers::maybe_start_logging(); + let batch0 = RecordBatch::try_from_iter(vec![ + ( + "int_col", + Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + "str_col", + Arc::new(StringArray::from(vec!["one", "two", "three"])) as 
ArrayRef, + ), + ]) + .unwrap(); + + let batch1 = RecordBatch::try_from_iter(vec![ + ( + "int_col", + Arc::new(Int64Array::from(vec![4, -2])) as ArrayRef, + ), + ( + "str_col", + Arc::new(StringArray::from(vec!["four", "negative 2"])) as ArrayRef, + ), + ]) + .unwrap(); + + let input = make_input(vec![vec![batch0, batch1]]); + // int_col < 3 + let split_expr = df_physical_expr(input.schema(), col("int_col").lt(lit(3))).unwrap(); + let split_exec: Arc = + Arc::new(StreamSplitExec::new(input, vec![split_expr])); + + let output0 = test_collect_partition(Arc::clone(&split_exec), 0).await; + let expected = vec![ + "+---------+------------+", + "| int_col | str_col |", + "+---------+------------+", + "| -2 | negative 2 |", + "| 1 | one |", + "| 2 | two |", + "+---------+------------+", + ]; + assert_batches_sorted_eq!(&expected, &output0); + + let output1 = test_collect_partition(split_exec, 1).await; + let expected = vec![ + "+---------+---------+", + "| int_col | str_col |", + "+---------+---------+", + "| 3 | three |", + "| 4 | four |", + "+---------+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output1); + } + + #[tokio::test] + async fn test_basic_split_multi_exprs() { + test_helpers::maybe_start_logging(); + let batch0 = RecordBatch::try_from_iter(vec![ + ( + "int_col", + Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + "str_col", + Arc::new(StringArray::from(vec!["one", "two", "three"])) as ArrayRef, + ), + ]) + .unwrap(); + + let batch1 = RecordBatch::try_from_iter(vec![ + ( + "int_col", + Arc::new(Int64Array::from(vec![4, -2])) as ArrayRef, + ), + ( + "str_col", + Arc::new(StringArray::from(vec!["four", "negative 2"])) as ArrayRef, + ), + ]) + .unwrap(); + + let input = make_input(vec![vec![batch0, batch1]]); + // int_col < 2 + let split_expr1 = + df_physical_expr(input.schema(), col("int_col").lt(lit::(2))).unwrap(); + // 2 <= int_col < 3 + let expr = col("int_col") + .gt_eq(lit::(2)) + .and(col("int_col").lt(lit::(3))); + let 
split_expr2 = df_physical_expr(input.schema(), expr).unwrap(); + let split_exec: Arc = + Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); + + let output0 = test_collect_partition(Arc::clone(&split_exec), 0).await; + let expected = vec![ + "+---------+------------+", + "| int_col | str_col |", + "+---------+------------+", + "| -2 | negative 2 |", + "| 1 | one |", + "+---------+------------+", + ]; + assert_batches_sorted_eq!(&expected, &output0); + + let output1 = test_collect_partition(Arc::clone(&split_exec), 1).await; + let expected = vec![ + "+---------+---------+", + "| int_col | str_col |", + "+---------+---------+", + "| 2 | two |", + "+---------+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output1); + + let output2 = test_collect_partition(split_exec, 2).await; + let expected = vec![ + "+---------+---------+", + "| int_col | str_col |", + "+---------+---------+", + "| 3 | three |", + "| 4 | four |", + "+---------+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output2); + } + + #[tokio::test] + async fn test_constant_split() { + // test that it works with a constant expression + test_helpers::maybe_start_logging(); + let batch0 = RecordBatch::try_from_iter(vec![( + "int_col", + Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef, + )]) + .unwrap(); + + let input = make_input(vec![vec![batch0]]); + // use `false` to send all outputs to second stream + let split_expr = df_physical_expr(input.schema(), lit(false)).unwrap(); + let split_exec: Arc = + Arc::new(StreamSplitExec::new(input, vec![split_expr])); + + let output0 = test_collect_partition(Arc::clone(&split_exec), 0).await; + let expected = vec!["+---------+", "| int_col |", "+---------+", "+---------+"]; + assert_batches_sorted_eq!(&expected, &output0); + + let output1 = test_collect_partition(split_exec, 1).await; + let expected = vec![ + "+---------+", + "| int_col |", + "+---------+", + "| 1 |", + "| 2 |", + "| 3 |", + "+---------+", + ]; + 
assert_batches_sorted_eq!(&expected, &output1); + } + + #[tokio::test] + async fn test_constant_split_multi_exprs() { + // test that it works with a constant expression + test_helpers::maybe_start_logging(); + let batch0 = RecordBatch::try_from_iter(vec![( + "int_col", + Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef, + )]) + .unwrap(); + + // Test 1: 3 streams but all data is sent to the second one + let input = make_input(vec![vec![batch0.clone()]]); + // use `false` & `true` to send all outputs to second stream + let split_expr1 = df_physical_expr(input.schema(), lit(false)).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), lit(true)).unwrap(); + let split_exec: Arc = + Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); + + let output0 = test_collect_partition(Arc::clone(&split_exec), 0).await; + let expected = vec!["+---------+", "| int_col |", "+---------+", "+---------+"]; + assert_batches_sorted_eq!(&expected, &output0); + + let output1 = test_collect_partition(Arc::clone(&split_exec), 1).await; + let expected = vec![ + "+---------+", + "| int_col |", + "+---------+", + "| 1 |", + "| 2 |", + "| 3 |", + "+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output1); + + let output2 = test_collect_partition(split_exec, 2).await; + let expected = vec!["+---------+", "| int_col |", "+---------+", "+---------+"]; + assert_batches_sorted_eq!(&expected, &output2); + + // ----------------------- + // Test 2: 3 streams but all data is sent to the last one + let input = make_input(vec![vec![batch0.clone()]]); + + // use `false` & `false` to send all outputs to third stream + let split_expr1 = df_physical_expr(input.schema(), lit(false)).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), lit(false)).unwrap(); + let split_exec: Arc = + Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); + + let output0 = test_collect_partition(Arc::clone(&split_exec), 0).await; + let expected = 
vec!["+---------+", "| int_col |", "+---------+", "+---------+"]; + assert_batches_sorted_eq!(&expected, &output0); + + let output1 = test_collect_partition(Arc::clone(&split_exec), 1).await; + let expected = vec!["+---------+", "| int_col |", "+---------+", "+---------+"]; + assert_batches_sorted_eq!(&expected, &output1); + + let output2 = test_collect_partition(Arc::clone(&split_exec), 2).await; + let expected = vec![ + "+---------+", + "| int_col |", + "+---------+", + "| 1 |", + "| 2 |", + "| 3 |", + "+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output2); + + // ----------------------- + // Test 3: 3 streams but all data is sent to the first + let input = make_input(vec![vec![batch0]]); + + // use `true` & `false` to send all outputs to first stream + let split_expr1 = df_physical_expr(input.schema(), lit(true)).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), lit(false)).unwrap(); + let split_exec: Arc = + Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); + + let output0 = test_collect_partition(Arc::clone(&split_exec), 0).await; + let expected = vec![ + "+---------+", + "| int_col |", + "+---------+", + "| 1 |", + "| 2 |", + "| 3 |", + "+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output0); + + let output1 = test_collect_partition(Arc::clone(&split_exec), 1).await; + let expected = vec!["+---------+", "| int_col |", "+---------+", "+---------+"]; + assert_batches_sorted_eq!(&expected, &output1); + + let output2 = test_collect_partition(Arc::clone(&split_exec), 2).await; + let expected = vec!["+---------+", "| int_col |", "+---------+", "+---------+"]; + assert_batches_sorted_eq!(&expected, &output2); + } + + #[tokio::test] + async fn test_nulls() { + // test with null inputs (so rows evaluate to null) + + test_helpers::maybe_start_logging(); + let batch0 = RecordBatch::try_from_iter(vec![( + "int_col", + Arc::new(Int64Array::from(vec![Some(1), None, Some(2), Some(3)])) as ArrayRef, + )]) + 
.unwrap(); + + let input = make_input(vec![vec![batch0]]); + // int_col < 3 + let split_expr = df_physical_expr(input.schema(), col("int_col").lt(lit(3))).unwrap(); + let split_exec: Arc = + Arc::new(StreamSplitExec::new(input, vec![split_expr])); + + let output0 = test_collect_partition(Arc::clone(&split_exec), 0).await; + let expected = vec![ + "+---------+", + "| int_col |", + "+---------+", + "| 1 |", + "| 2 |", + "+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output0); + + let output1 = test_collect_partition(split_exec, 1).await; + let expected = vec![ + "+---------+", + "| int_col |", + "+---------+", + "| |", + "| 3 |", + "+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output1); + } + + #[tokio::test] + async fn test_nulls_multi_exprs() { + // test with null inputs (so rows evaluate to null) + + test_helpers::maybe_start_logging(); + let batch0 = RecordBatch::try_from_iter(vec![( + "int_col", + Arc::new(Int64Array::from(vec![Some(1), None, Some(2), Some(3)])) as ArrayRef, + )]) + .unwrap(); + + let input = make_input(vec![vec![batch0]]); + // int_col < 2 + let split_expr1 = + df_physical_expr(input.schema(), col("int_col").lt(lit::(2))).unwrap(); + // 2 <= int_col < 3 + let expr = col("int_col") + .gt_eq(lit::(2)) + .and(col("int_col").lt(lit::(3))); + let split_expr2 = df_physical_expr(input.schema(), expr).unwrap(); + let split_exec: Arc = + Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); + + let output0 = test_collect_partition(Arc::clone(&split_exec), 0).await; + let expected = vec![ + "+---------+", + "| int_col |", + "+---------+", + "| 1 |", + "+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output0); + + let output1 = test_collect_partition(Arc::clone(&split_exec), 1).await; + let expected = vec![ + "+---------+", + "| int_col |", + "+---------+", + "| 2 |", + "+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output1); + + let output2 = test_collect_partition(split_exec, 2).await; 
+ let expected = vec![ + "+---------+", + "| int_col |", + "+---------+", + "| |", + "| 3 |", + "+---------+", + ]; + assert_batches_sorted_eq!(&expected, &output2); + } + + #[tokio::test] + #[should_panic(expected = "Expected boolean array, but had type Int64")] + async fn test_non_bool() { + // test non boolean expression (expect error) + + test_helpers::maybe_start_logging(); + let batch0 = RecordBatch::try_from_iter(vec![( + "int_col", + Arc::new(Int64Array::from(vec![Some(1), None, Some(2), Some(3)])) as ArrayRef, + )]) + .unwrap(); + + let input = make_input(vec![vec![batch0]]); + // int_col (not a boolean) + let split_expr = df_physical_expr(input.schema(), col("int_col")).unwrap(); + let split_exec: Arc = + Arc::new(StreamSplitExec::new(input, vec![split_expr])); + + test_collect_partition(split_exec, 0).await; + } + + fn make_input(partitions: Vec>) -> Arc { + let schema = partitions + .iter() + .flat_map(|p| p.iter()) + .map(|batch| batch.schema()) + .next() + .expect("must be at least one batch"); + + let projection = None; + let input = + MemoryExec::try_new(&partitions, schema, projection).expect("Created MemoryExec"); + Arc::new(input) + } +} diff --git a/iox_query/src/exec/stringset.rs b/iox_query/src/exec/stringset.rs new file mode 100644 index 0000000..69fe153 --- /dev/null +++ b/iox_query/src/exec/stringset.rs @@ -0,0 +1,149 @@ +//! This module contains the definition of a "StringSet" a set of +//! logical strings and the code to create them from record batches. 
+ +use std::{collections::BTreeSet, sync::Arc}; + +use arrow::{ + array::{Array, DictionaryArray, StringArray}, + datatypes::{DataType, Int32Type, SchemaRef}, + record_batch::RecordBatch, +}; + +use snafu::{ensure, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Error extracting results from Record Batches: schema not a single Utf8 or string dictionary: {:?}", + schema + ))] + InternalSchemaWasNotString { schema: SchemaRef }, + + #[snafu(display("Internal error, unexpected null value"))] + InternalUnexpectedNull {}, + + #[snafu(display( + "Error reading record batch while converting to StringSet: {:?}", + source + ))] + ReadingRecordBatch { source: arrow::error::ArrowError }, +} + +pub type Result = std::result::Result; + +pub type StringSet = BTreeSet; +pub type StringSetRef = Arc; + +/// Trait to convert RecordBatch'y things into +/// `StringSetRef`s. Assumes that the input record batches each have a +/// single string column. Can return errors, so don't use +/// `std::convert::From` +pub trait IntoStringSet { + /// Convert this thing into a stringset + fn into_stringset(self) -> Result; +} + +impl IntoStringSet for &[&str] { + fn into_stringset(self) -> Result { + let set: StringSet = self.iter().map(|s| s.to_string()).collect(); + Ok(Arc::new(set)) + } +} + +/// Converts record batches into StringSets. 
+impl IntoStringSet for Vec { + fn into_stringset(self) -> Result { + let mut strings = StringSet::new(); + + // process the record batches one by one + for record_batch in self.into_iter() { + let num_rows = record_batch.num_rows(); + let schema = record_batch.schema(); + let fields = schema.fields(); + ensure!( + fields.len() == 1, + InternalSchemaWasNotStringSnafu { + schema: Arc::clone(&schema), + } + ); + + let field = &fields[0]; + + match field.data_type() { + DataType::Utf8 => { + let array = record_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + add_utf8_array_to_stringset(&mut strings, array, num_rows)?; + } + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + let array = record_batch + .column(0) + .as_any() + .downcast_ref::>() + .unwrap(); + + add_utf8_dictionary_to_stringset(&mut strings, array, num_rows)?; + } + _ => InternalSchemaWasNotStringSnafu { + schema: Arc::clone(&schema), + } + .fail()?, + } + } + Ok(StringSetRef::new(strings)) + } +} + +fn add_utf8_array_to_stringset( + dest: &mut StringSet, + src: &StringArray, + num_rows: usize, +) -> Result<()> { + for i in 0..num_rows { + // Not sure how to handle a NULL -- StringSet contains + // Strings, not Option + if src.is_null(i) { + return InternalUnexpectedNullSnafu {}.fail(); + } else { + let src_value = src.value(i); + if !dest.contains(src_value) { + dest.insert(src_value.into()); + } + } + } + Ok(()) +} + +fn add_utf8_dictionary_to_stringset( + dest: &mut StringSet, + dictionary: &DictionaryArray, + num_rows: usize, +) -> Result<()> { + let keys = dictionary.keys(); + let values = dictionary.values(); + let values = values.as_any().downcast_ref::().unwrap(); + + // It might be quicker to construct an intermediate collection + // of unique indexes and then hydrate them + + for i in 0..num_rows { + // Not sure how to handle a NULL -- StringSet contains + // Strings, not Option + if keys.is_null(i) { + return 
InternalUnexpectedNullSnafu {}.fail(); + } else { + let idx = keys.value(i); + let src_value = values.value(idx as _); + if !dest.contains(src_value) { + dest.insert(src_value.into()); + } + } + } + Ok(()) +} diff --git a/iox_query/src/frontend.rs b/iox_query/src/frontend.rs new file mode 100644 index 0000000..7a6bd64 --- /dev/null +++ b/iox_query/src/frontend.rs @@ -0,0 +1,251 @@ +pub mod reorg; +pub mod sql; + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use datafusion::physical_plan::{ + metrics::{self, MetricValue}, + ExecutionPlan, ExecutionPlanVisitor, + }; + use datafusion_util::test_execute_partition; + use futures::StreamExt; + use schema::{merge::SchemaMerger, sort::SortKey, Schema}; + + use crate::{ + exec::{split::StreamSplitExec, Executor, ExecutorType}, + frontend::reorg::ReorgPlanner, + provider::{DeduplicateExec, RecordBatchesExec}, + test::TestChunk, + QueryChunk, + }; + + /// A macro to asserts the contents of the extracted metrics is reasonable + /// + macro_rules! 
assert_extracted_metrics { + ($EXTRACTED: expr, $EXPECTED_OUTPUT_ROWS: expr) => { + assert!( + $EXTRACTED.elapsed_compute.value() > 0, + "some elapsed compute time" + ); + assert_eq!( + $EXTRACTED.output_rows.value(), + $EXPECTED_OUTPUT_ROWS, + "expected output row count" + ); + + let start_ts = $EXTRACTED + .start_timestamp + .value() + .expect("start timestamp") + .timestamp_nanos_opt() + .expect("start timestamp in range"); + let end_ts = $EXTRACTED + .end_timestamp + .value() + .expect("end timestamp") + .timestamp_nanos_opt() + .expect("end timestamp in range"); + + assert!(start_ts > 0, "start timestamp was non zero"); + assert!(end_ts > 0, "end timestamp was non zero"); + assert!( + start_ts < end_ts, + "start timestamp was before end timestamp" + ); + }; + } + + #[tokio::test] + async fn test_metrics() { + test_helpers::maybe_start_logging(); + let (schema, chunks) = get_test_chunks(); + let sort_key = SortKey::from_columns(vec!["time", "tag1"]); + + // Use a split plan as it has StreamSplitExec, DeduplicateExec and IOxReadFilternode + let split_plan = ReorgPlanner::new() + .split_plan(Arc::from("t"), &schema, chunks, sort_key, vec![1000]) + .expect("created compact plan"); + + let executor = Executor::new_testing(); + let plan = executor + .new_context(ExecutorType::Reorg) + .create_physical_plan(&split_plan) + .await + .unwrap(); + + assert_eq!(plan.output_partitioning().partition_count(), 2); + + println!("Executing partition 0"); + let mut stream0 = test_execute_partition(Arc::clone(&plan), 0).await; + let mut num_rows = 0; + while let Some(batch) = stream0.next().await { + num_rows += batch.unwrap().num_rows(); + } + assert_eq!(num_rows, 3); + + println!("Executing partition 1"); + let mut stream1 = test_execute_partition(Arc::clone(&plan), 1).await; + let mut num_rows = 0; + while let Some(batch) = stream1.next().await { + num_rows += batch.unwrap().num_rows(); + } + assert_eq!(num_rows, 5); + + // now validate metrics are good + let extracted = 
extract_metrics(plan.as_ref(), |plan| { + plan.as_any().downcast_ref::().is_some() + }) + .unwrap(); + + assert_extracted_metrics!(extracted, 9); + + // now the deduplicator + let extracted = extract_metrics(plan.as_ref(), |plan| { + plan.as_any().downcast_ref::().is_some() + }) + .unwrap(); + + assert_extracted_metrics!(extracted, 3); + + // now the the split + let extracted = extract_metrics(plan.as_ref(), |plan| { + plan.as_any().downcast_ref::().is_some() + }) + .unwrap(); + + assert_extracted_metrics!(extracted, 8); + } + + // Extracted baseline metrics for the specified operator + #[derive(Debug)] + struct ExtractedMetrics { + elapsed_compute: metrics::Time, + output_rows: metrics::Count, + start_timestamp: metrics::Timestamp, + end_timestamp: metrics::Timestamp, + } + + // walks a plan tree, looking for the first plan node where a + // predicate returns true and extracts the common metrics + struct MetricsExtractor

+ where + P: FnMut(&dyn ExecutionPlan) -> bool, + { + pred: P, + inner: Option, + } + + impl

ExecutionPlanVisitor for MetricsExtractor

+ where + P: FnMut(&dyn ExecutionPlan) -> bool, + { + type Error = std::convert::Infallible; + + fn pre_visit( + &mut self, + plan: &dyn ExecutionPlan, + ) -> std::result::Result { + // not visiting this one + if !(self.pred)(plan) { + return Ok(true); + } + let metrics = plan.metrics().unwrap().aggregate_by_name(); + let mut elapsed_compute: Option = None; + let mut output_rows: Option = None; + let mut start_timestamp: Option = None; + let mut end_timestamp: Option = None; + + metrics.iter().for_each(|m| match m.value() { + MetricValue::ElapsedCompute(t) => { + assert!(elapsed_compute.is_none()); + elapsed_compute = Some(t.clone()) + } + MetricValue::OutputRows(c) => { + assert!(output_rows.is_none()); + output_rows = Some(c.clone()) + } + MetricValue::StartTimestamp(ts) => { + assert!(start_timestamp.is_none()); + start_timestamp = Some(ts.clone()) + } + MetricValue::EndTimestamp(ts) => { + assert!(end_timestamp.is_none()); + end_timestamp = Some(ts.clone()) + } + _ => {} + }); + + let new = ExtractedMetrics { + elapsed_compute: elapsed_compute.expect("did not find metric"), + output_rows: output_rows.expect("did not find metric"), + start_timestamp: start_timestamp.expect("did not find metric"), + end_timestamp: end_timestamp.expect("did not find metric"), + }; + + if let Some(existing) = &self.inner { + let ExtractedMetrics { + elapsed_compute, + output_rows, + start_timestamp, + end_timestamp, + } = existing; + new.elapsed_compute.add(elapsed_compute); + new.output_rows.add(output_rows.value()); + new.start_timestamp.update_to_min(start_timestamp); + new.end_timestamp.update_to_max(end_timestamp); + } + self.inner = Some(new); + + // found what we are looking for, no need to continue + Ok(false) + } + } + + fn extract_metrics

(plan: &dyn ExecutionPlan, pred: P) -> Option + where + P: FnMut(&dyn ExecutionPlan) -> bool, + { + let mut extractor = MetricsExtractor { pred, inner: None }; + + datafusion::physical_plan::accept(plan, &mut extractor).unwrap(); + + extractor.inner + } + + fn get_test_chunks() -> (Schema, Vec>) { + let max_time = 7000; + let chunk1 = Arc::new( + TestChunk::new("t") + .with_order(1) + .with_partition(1) + .with_time_column_with_stats(Some(50), Some(max_time)) + .with_tag_column_with_stats("tag1", Some("AL"), Some("MT")) + .with_i64_field_column("field_int") + .with_five_rows_of_data(), + ); + + // Chunk 2 has an extra field, and only 4 rows + let chunk2 = Arc::new( + TestChunk::new("t") + .with_order(2) + .with_partition(1) + .with_time_column_with_stats(Some(28000), Some(220000)) + .with_tag_column_with_stats("tag1", Some("UT"), Some("WA")) + .with_i64_field_column("field_int") + .with_i64_field_column("field_int2") + .with_may_contain_pk_duplicates(true) + .with_four_rows_of_data(), + ); + + let schema = SchemaMerger::new() + .merge(chunk1.schema()) + .unwrap() + .merge(chunk2.schema()) + .unwrap() + .build(); + + (schema, vec![chunk1, chunk2]) + } +} diff --git a/iox_query/src/frontend/reorg.rs b/iox_query/src/frontend/reorg.rs new file mode 100644 index 0000000..9bf8259 --- /dev/null +++ b/iox_query/src/frontend/reorg.rs @@ -0,0 +1,732 @@ +//! planning for physical reorganization operations (e.g. 
COMPACT) + +use std::sync::Arc; + +use datafusion::{logical_expr::LogicalPlan, prelude::col}; +use datafusion_util::lit_timestamptz_nano; +use observability_deps::tracing::debug; +use schema::{sort::SortKey, Schema, TIME_COLUMN_NAME}; + +use crate::{ + exec::make_stream_split, provider::ProviderBuilder, util::logical_sort_key_exprs, QueryChunk, +}; +use snafu::{ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Chunk schema not compatible for compact plan: {}", source))] + ChunkSchemaNotCompatible { source: schema::merge::Error }, + + #[snafu(display("Reorg planner got error building plan: {}", source))] + BuildingPlan { + source: datafusion::error::DataFusionError, + }, + + #[snafu(display( + "Reorg planner got error adding creating scan for {}: {}", + table_name, + source + ))] + CreatingScan { + table_name: String, + source: crate::provider::Error, + }, +} +pub type Result = std::result::Result; + +impl From for Error { + fn from(source: datafusion::error::DataFusionError) -> Self { + Self::BuildingPlan { source } + } +} + +/// Planner for physically rearranging chunk data. This planner +/// creates COMPACT and SPLIT plans for use in the database lifecycle manager +#[derive(Debug, Default, Copy, Clone)] +pub struct ReorgPlanner {} + +impl ReorgPlanner { + pub fn new() -> Self { + Self::default() + } + + /// Creates an execution plan for the COMPACT operations which does the following: + /// + /// 1. Merges chunks together into a single stream + /// 2. Deduplicates via PK as necessary + /// 3. 
Sorts the result according to the requested `output_sort_key` (if necessary) + /// + /// The plan looks like: + /// + /// ```text + /// (Optional Sort on output_sort_key) + /// (Scan chunks) <-- any needed deduplication happens here + /// ``` + pub fn compact_plan( + &self, + table_name: Arc, + schema: &Schema, + chunks: I, + output_sort_key: SortKey, + ) -> Result + where + I: IntoIterator>, + { + let mut builder = ProviderBuilder::new(Arc::clone(&table_name), schema.clone()) + .with_enable_deduplication(true); + + for chunk in chunks { + builder = builder.add_chunk(chunk); + } + + let provider = builder.build().context(CreatingScanSnafu { + table_name: table_name.as_ref(), + })?; + let plan_builder = Arc::new(provider) + .into_logical_plan_builder() + .context(BuildingPlanSnafu)?; + let sort_expr = logical_sort_key_exprs(&output_sort_key); + let plan = plan_builder + .sort(sort_expr) + .context(BuildingPlanSnafu)? + .build() + .context(BuildingPlanSnafu)?; + + debug!(table_name=table_name.as_ref(), plan=%plan.display_indent_schema(), + "created compact plan for table"); + + Ok(plan) + } + + /// Creates an execution plan for the SPLIT operations which does the following: + /// + /// 1. Merges chunks together into a single stream + /// 2. Deduplicates via PK as necessary + /// 3. Sorts the result according to the requested output_sort_key + /// 4. Splits the stream on value of the `time` column: Those + /// rows that are on or before the time and those that are after + /// + /// The plan looks like: + /// + /// ```text + /// (Split on Time) + /// (Sort on output_sort) + /// (Scan chunks) <-- any needed deduplication happens here + /// ``` + /// + /// The output execution plan has `N` "output streams" (DataFusion + /// partitions) where `N` = `split_times.len() + 1`. 
The + /// time ranges of the streams are: + /// + /// Stream 0: Rows that have `time` *on or before* the `split_times[0]` + /// + /// Stream i, where 0 < i < split_times.len(): + /// Rows have: `time` in range `(split_times[i-1], split_times[i]]`, + /// Which is: greater than `split_times[i-1]` up to and including `split_times[i]`. + /// + /// Stream n, where n = split_times.len()): Rows that have `time` + /// *after* `split_times[n-1]` as well as NULL rows + /// + /// # Panics + /// + /// The code will panic if split_times are not in monotonically increasing order + /// + /// # Example + /// if the input looks like: + /// ```text + /// X | time + /// ---+----- + /// b | 2000 + /// a | 1000 + /// c | 4000 + /// d | 2000 + /// e | 3000 + /// ``` + /// A split plan with `sort=time` and `split_times=[2000, 3000]` will produce the following three output streams + /// + /// ```text + /// X | time + /// ---+----- + /// a | 1000 + /// b | 2000 + /// d | 2000 + /// ``` + /// + /// ```text + /// X | time + /// ---+----- + /// e | 3000 + /// ``` + /// + /// ```text + /// X | time + /// ---+----- + /// c | 4000 + /// ``` + pub fn split_plan( + &self, + table_name: Arc, + schema: &Schema, + chunks: I, + output_sort_key: SortKey, + split_times: Vec, + ) -> Result + where + I: IntoIterator>, + { + // split_times must have values + if split_times.is_empty() { + panic!("Split plan does not accept empty split_times"); + } + + let mut builder = ProviderBuilder::new(Arc::clone(&table_name), schema.clone()) + .with_enable_deduplication(true); + + for chunk in chunks { + builder = builder.add_chunk(chunk); + } + + let provider = builder.build().context(CreatingScanSnafu { + table_name: table_name.as_ref(), + })?; + let plan_builder = Arc::new(provider) + .into_logical_plan_builder() + .context(BuildingPlanSnafu)?; + let sort_expr = logical_sort_key_exprs(&output_sort_key); + let plan = plan_builder + .sort(sort_expr) + .context(BuildingPlanSnafu)? 
+ .build() + .context(BuildingPlanSnafu)?; + + let mut split_exprs = Vec::with_capacity(split_times.len()); + // time <= split_times[0] + split_exprs.push(col(TIME_COLUMN_NAME).lt_eq(lit_timestamptz_nano(split_times[0]))); + // split_times[i-1] , time <= split_time[i] + for i in 1..split_times.len() { + if split_times[i - 1] >= split_times[i] { + panic!( + "split_times[{}]: {} must be smaller than split_times[{}]: {}", + i - 1, + split_times[i - 1], + i, + split_times[i] + ); + } + split_exprs.push( + col(TIME_COLUMN_NAME) + .gt(lit_timestamptz_nano(split_times[i - 1])) + .and(col(TIME_COLUMN_NAME).lt_eq(lit_timestamptz_nano(split_times[i]))), + ); + } + let plan = make_stream_split(plan, split_exprs); + + debug!(table_name=table_name.as_ref(), plan=%plan.display_indent_schema(), + "created split plan for table"); + + Ok(plan) + } +} + +#[cfg(test)] +mod test { + use arrow_util::assert_batches_eq; + use datafusion_util::{test_collect, test_collect_partition}; + use schema::merge::SchemaMerger; + use schema::sort::SortKeyBuilder; + + use crate::{ + exec::{Executor, ExecutorType}, + test::{format_execution_plan, raw_data, TestChunk}, + }; + + use super::*; + + async fn get_test_chunks() -> (Schema, Vec>) { + // Chunk 1 with 5 rows of data on 2 tags + let chunk1 = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(50), Some(7000)) + .with_tag_column_with_stats("tag1", Some("AL"), Some("MT")) + .with_i64_field_column("field_int") + .with_five_rows_of_data(), + ) as Arc; + + // Chunk 2 has an extra field, and only 4 fields + let chunk2 = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(28000), Some(220000)) + .with_tag_column_with_stats("tag1", Some("UT"), Some("WA")) + .with_i64_field_column("field_int") + .with_i64_field_column("field_int2") + .with_may_contain_pk_duplicates(true) + .with_four_rows_of_data(), + ) as Arc; + + let expected = vec![ + "+-----------+------+--------------------------------+", + "| field_int | tag1 | time 
|", + "+-----------+------+--------------------------------+", + "| 1000 | MT | 1970-01-01T00:00:00.000001Z |", + "| 10 | MT | 1970-01-01T00:00:00.000007Z |", + "| 70 | CT | 1970-01-01T00:00:00.000000100Z |", + "| 100 | AL | 1970-01-01T00:00:00.000000050Z |", + "| 5 | MT | 1970-01-01T00:00:00.000005Z |", + "+-----------+------+--------------------------------+", + ]; + assert_batches_eq!(&expected, &raw_data(&[Arc::clone(&chunk1)]).await); + + let expected = vec![ + "+-----------+------------+------+-----------------------------+", + "| field_int | field_int2 | tag1 | time |", + "+-----------+------------+------+-----------------------------+", + "| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |", + "| 10 | 10 | VT | 1970-01-01T00:00:00.000210Z |", + "| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |", + "| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |", + "+-----------+------------+------+-----------------------------+", + ]; + assert_batches_eq!(&expected, &raw_data(&[Arc::clone(&chunk2)]).await); + + let schema = SchemaMerger::new() + .merge(chunk1.schema()) + .unwrap() + .merge(chunk2.schema()) + .unwrap() + .build(); + + (schema, vec![chunk1, chunk2]) + } + + async fn get_sorted_test_chunks() -> (Schema, Vec>) { + // Chunk 1 + let chunk1 = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(1000), Some(1000)) + .with_tag_column_with_stats("tag1", Some("A"), Some("A")) + .with_i64_field_column("field_int") + .with_one_row_of_specific_data("A", 1, 1000), + ) as Arc; + + let expected = vec![ + "+-----------+------+-----------------------------+", + "| field_int | tag1 | time |", + "+-----------+------+-----------------------------+", + "| 1 | A | 1970-01-01T00:00:00.000001Z |", + "+-----------+------+-----------------------------+", + ]; + assert_batches_eq!(&expected, &raw_data(&[Arc::clone(&chunk1)]).await); + + // Chunk 2 + let chunk2 = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(2000), Some(2000)) + 
.with_tag_column_with_stats("tag1", Some("B"), Some("B")) + .with_i64_field_column("field_int") + .with_one_row_of_specific_data("B", 2, 2000), + ) as Arc; + + let expected = vec![ + "+-----------+------+-----------------------------+", + "| field_int | tag1 | time |", + "+-----------+------+-----------------------------+", + "| 2 | B | 1970-01-01T00:00:00.000002Z |", + "+-----------+------+-----------------------------+", + ]; + assert_batches_eq!(&expected, &raw_data(&[Arc::clone(&chunk2)]).await); + + (chunk1.schema().clone(), vec![chunk1, chunk2]) + } + + #[tokio::test] + async fn test_compact_plan_sorted() { + test_helpers::maybe_start_logging(); + + // ensures that the output is actually sorted + // https://github.com/influxdata/influxdb_iox/issues/6125 + let (schema, chunks) = get_sorted_test_chunks().await; + + let chunk_orders = vec![ + // reverse order + vec![Arc::clone(&chunks[1]), Arc::clone(&chunks[0])], + chunks, + ]; + + // executor has only 1 thread + let executor = Executor::new_testing(); + for chunks in chunk_orders { + let sort_key = SortKeyBuilder::with_capacity(2) + .with_col_opts("tag1", false, true) + .with_col_opts(TIME_COLUMN_NAME, false, true) + .build(); + + let compact_plan = ReorgPlanner::new() + .compact_plan(Arc::from("t"), &schema, chunks, sort_key) + .expect("created compact plan"); + + let physical_plan = executor + .new_context(ExecutorType::Reorg) + .create_physical_plan(&compact_plan) + .await + .unwrap(); + + let batches = test_collect(physical_plan).await; + + // should be sorted on tag1 then timestamp + let expected = vec![ + "+-----------+------+-----------------------------+", + "| field_int | tag1 | time |", + "+-----------+------+-----------------------------+", + "| 1 | A | 1970-01-01T00:00:00.000001Z |", + "| 2 | B | 1970-01-01T00:00:00.000002Z |", + "+-----------+------+-----------------------------+", + ]; + + assert_batches_eq!(&expected, &batches); + } + } + + #[tokio::test] + async fn 
test_compact_plan_default_sort() { + test_helpers::maybe_start_logging(); + + let (schema, chunks) = get_test_chunks().await; + + let sort_key = SortKeyBuilder::with_capacity(2) + .with_col("tag1") + .with_col(TIME_COLUMN_NAME) + .build(); + + let compact_plan = ReorgPlanner::new() + .compact_plan(Arc::from("t"), &schema, chunks, sort_key) + .expect("created compact plan"); + + let executor = Executor::new_testing(); + let physical_plan = executor + .new_context(ExecutorType::Reorg) + .create_physical_plan(&compact_plan) + .await + .unwrap(); + + // It is critical that the plan only sorts the inputs and is not resorted after the UnionExec. + insta::assert_yaml_snapshot!( + format_execution_plan(&physical_plan), + @r###" + --- + - " SortPreservingMergeExec: [tag1@2 ASC,time@3 ASC]" + - " UnionExec" + - " SortExec: expr=[tag1@2 ASC,time@3 ASC]" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" + - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" + - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" + - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" + "### + ); + + assert_eq!( + physical_plan.output_partitioning().partition_count(), + 1, + "{:?}", + physical_plan.output_partitioning() + ); + + let batches = test_collect(physical_plan).await; + + // sorted on state ASC and time ASC (defaults) + let expected = vec![ + "+-----------+------------+------+--------------------------------+", + "| field_int | field_int2 | tag1 | time |", + "+-----------+------------+------+--------------------------------+", + "| 100 | | AL | 1970-01-01T00:00:00.000000050Z |", + "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | MT | 1970-01-01T00:00:00.000001Z |", + "| 5 | | MT | 1970-01-01T00:00:00.000005Z |", + "| 10 | | MT | 1970-01-01T00:00:00.000007Z |", + "| 70 | 70 | UT | 
1970-01-01T00:00:00.000220Z |", + "| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |", + "| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |", + "+-----------+------------+------+--------------------------------+", + ]; + + assert_batches_eq!(&expected, &batches); + } + + #[tokio::test] + async fn test_compact_plan_alternate_sort() { + test_helpers::maybe_start_logging(); + + let (schema, chunks) = get_test_chunks().await; + + let sort_key = SortKeyBuilder::with_capacity(2) + // use something other than the default sort + .with_col_opts("tag1", true, true) + .with_col_opts(TIME_COLUMN_NAME, false, false) + .build(); + + let compact_plan = ReorgPlanner::new() + .compact_plan(Arc::from("t"), &schema, chunks, sort_key) + .expect("created compact plan"); + + let executor = Executor::new_testing(); + let physical_plan = executor + .new_context(ExecutorType::Reorg) + .create_physical_plan(&compact_plan) + .await + .unwrap(); + + insta::assert_yaml_snapshot!( + format_execution_plan(&physical_plan), + @r###" + --- + - " SortPreservingMergeExec: [tag1@2 DESC,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[tag1@2 DESC,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" + - " SortExec: expr=[tag1@2 DESC,time@3 ASC NULLS LAST]" + - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" + - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" + - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" + "### + ); + + assert_eq!( + physical_plan.output_partitioning().partition_count(), + 1, + "{:?}", + physical_plan.output_partitioning() + ); + + let batches = test_collect(physical_plan).await; + + // sorted on state DESC and time ASC + let expected = vec![ + "+-----------+------------+------+--------------------------------+", + "| field_int | field_int2 | tag1 | 
time |", + "+-----------+------------+------+--------------------------------+", + "| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |", + "| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |", + "| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |", + "| 1000 | | MT | 1970-01-01T00:00:00.000001Z |", + "| 5 | | MT | 1970-01-01T00:00:00.000005Z |", + "| 10 | | MT | 1970-01-01T00:00:00.000007Z |", + "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 100 | | AL | 1970-01-01T00:00:00.000000050Z |", + "+-----------+------------+------+--------------------------------+", + ]; + + assert_batches_eq!(&expected, &batches); + } + + #[tokio::test] + async fn test_split_plan() { + test_helpers::maybe_start_logging(); + // validate that the plumbing is all hooked up. The logic of + // the operator is tested in its own module. + let (schema, chunks) = get_test_chunks().await; + + let sort_key = SortKeyBuilder::with_capacity(2) + .with_col_opts("time", false, false) + .with_col_opts("tag1", false, true) + .build(); + + // split on 1000 should have timestamps 1000, 5000, and 7000 + let split_plan = ReorgPlanner::new() + .split_plan(Arc::from("t"), &schema, chunks, sort_key, vec![1000]) + .expect("created compact plan"); + + let executor = Executor::new_testing(); + let physical_plan = executor + .new_context(ExecutorType::Reorg) + .create_physical_plan(&split_plan) + .await + .unwrap(); + + insta::assert_yaml_snapshot!( + format_execution_plan(&physical_plan), + @r###" + --- + - " StreamSplitExec" + - " SortPreservingMergeExec: [time@3 ASC NULLS LAST,tag1@2 ASC]" + - " UnionExec" + - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" + - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" + - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" + - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" + - " SortExec: expr=[tag1@3 ASC,time@4 
ASC,__chunk_order@0 ASC]" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" + "### + ); + + assert_eq!( + physical_plan.output_partitioning().partition_count(), + 2, + "{:?}", + physical_plan.output_partitioning() + ); + + // verify that the stream was split + let batches0 = test_collect_partition(Arc::clone(&physical_plan), 0).await; + + // Note sorted on time + let expected = vec![ + "+-----------+------------+------+--------------------------------+", + "| field_int | field_int2 | tag1 | time |", + "+-----------+------------+------+--------------------------------+", + "| 100 | | AL | 1970-01-01T00:00:00.000000050Z |", + "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | MT | 1970-01-01T00:00:00.000001Z |", + "+-----------+------------+------+--------------------------------+", + ]; + assert_batches_eq!(&expected, &batches0); + + let batches1 = test_collect_partition(physical_plan, 1).await; + + // Sorted on time + let expected = vec![ + "+-----------+------------+------+-----------------------------+", + "| field_int | field_int2 | tag1 | time |", + "+-----------+------------+------+-----------------------------+", + "| 5 | | MT | 1970-01-01T00:00:00.000005Z |", + "| 10 | | MT | 1970-01-01T00:00:00.000007Z |", + "| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |", + "| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |", + "| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |", + "+-----------+------------+------+-----------------------------+", + ]; + + assert_batches_eq!(&expected, &batches1); + } + + #[tokio::test] + async fn test_split_plan_multi_exps() { + test_helpers::maybe_start_logging(); + // validate that the plumbing is all hooked up. The logic of + // the operator is tested in its own module. 
+ let (schema, chunks) = get_test_chunks().await; + + let sort_key = SortKeyBuilder::with_capacity(2) + .with_col_opts("time", false, false) + .with_col_opts("tag1", false, true) + .build(); + + // split on 1000 and 7000 + let split_plan = ReorgPlanner::new() + .split_plan(Arc::from("t"), &schema, chunks, sort_key, vec![1000, 7000]) + .expect("created compact plan"); + + let executor = Executor::new_testing(); + let physical_plan = executor + .new_context(ExecutorType::Reorg) + .create_physical_plan(&split_plan) + .await + .unwrap(); + + insta::assert_yaml_snapshot!( + format_execution_plan(&physical_plan), + @r###" + --- + - " StreamSplitExec" + - " SortPreservingMergeExec: [time@3 ASC NULLS LAST,tag1@2 ASC]" + - " UnionExec" + - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" + - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" + - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" + - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" + - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" + "### + ); + + assert_eq!( + physical_plan.output_partitioning().partition_count(), + 3, + "{:?}", + physical_plan.output_partitioning() + ); + + // Verify that the stream was split + + // Note sorted on time + // Should include time <= 1000 + let batches0 = test_collect_partition(Arc::clone(&physical_plan), 0).await; + let expected = vec![ + "+-----------+------------+------+--------------------------------+", + "| field_int | field_int2 | tag1 | time |", + "+-----------+------------+------+--------------------------------+", + "| 100 | | AL | 1970-01-01T00:00:00.000000050Z |", + "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | MT | 1970-01-01T00:00:00.000001Z |", + 
"+-----------+------------+------+--------------------------------+", + ]; + assert_batches_eq!(&expected, &batches0); + + // Sorted on time + // Should include 1000 < time <= 7000 + let batches1 = test_collect_partition(Arc::clone(&physical_plan), 1).await; + let expected = vec![ + "+-----------+------------+------+-----------------------------+", + "| field_int | field_int2 | tag1 | time |", + "+-----------+------------+------+-----------------------------+", + "| 5 | | MT | 1970-01-01T00:00:00.000005Z |", + "| 10 | | MT | 1970-01-01T00:00:00.000007Z |", + "+-----------+------------+------+-----------------------------+", + ]; + assert_batches_eq!(&expected, &batches1); + + // Sorted on time + // Should include 7000 < time + let batches2 = test_collect_partition(physical_plan, 2).await; + let expected = vec![ + "+-----------+------------+------+-----------------------------+", + "| field_int | field_int2 | tag1 | time |", + "+-----------+------------+------+-----------------------------+", + "| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |", + "| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |", + "| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |", + "+-----------+------------+------+-----------------------------+", + ]; + assert_batches_eq!(&expected, &batches2); + } + + #[tokio::test] + #[should_panic(expected = "Split plan does not accept empty split_times")] + async fn test_split_plan_panic_empty() { + test_helpers::maybe_start_logging(); + // validate that the plumbing is all hooked up. The logic of + // the operator is tested in its own module. 
+ let (schema, chunks) = get_test_chunks().await; + + let sort_key = SortKeyBuilder::with_capacity(2) + .with_col_opts("time", false, false) + .with_col_opts("tag1", false, true) + .build(); + + // split on 1000 and 7000 + let _split_plan = ReorgPlanner::new() + .split_plan(Arc::from("t"), &schema, chunks, sort_key, vec![]) // reason of panic: empty split_times + .expect("created compact plan"); + } + + #[tokio::test] + #[should_panic(expected = "split_times[0]: 1000 must be smaller than split_times[1]: 500")] + async fn test_split_plan_panic_times() { + test_helpers::maybe_start_logging(); + // validate that the plumbing is all hooked up. The logic of + // the operator is tested in its own module. + let (schema, chunks) = get_test_chunks().await; + + let sort_key = SortKeyBuilder::with_capacity(2) + .with_col_opts("time", false, false) + .with_col_opts("tag1", false, true) + .build(); + + // split on 1000 and 7000 + let _split_plan = ReorgPlanner::new() + .split_plan(Arc::from("t"), &schema, chunks, sort_key, vec![1000, 500]) // reason of panic: split_times not in ascending order + .expect("created compact plan"); + } +} diff --git a/iox_query/src/frontend/sql.rs b/iox_query/src/frontend/sql.rs new file mode 100644 index 0000000..4008e3c --- /dev/null +++ b/iox_query/src/frontend/sql.rs @@ -0,0 +1,26 @@ +use std::sync::Arc; + +use crate::exec::context::IOxSessionContext; +use datafusion::{common::ParamValues, error::Result, physical_plan::ExecutionPlan}; + +/// This struct can create plans for running SQL queries against databases +#[derive(Debug, Default, Copy, Clone)] +pub struct SqlQueryPlanner {} + +impl SqlQueryPlanner { + pub fn new() -> Self { + Self::default() + } + + /// Plan a SQL query against the catalogs registered with `ctx`, and return a + /// DataFusion physical execution plan that runs on the query executor. 
+ pub async fn query( + &self, + query: &str, + params: impl Into + Send, + ctx: &IOxSessionContext, + ) -> Result> { + let ctx = ctx.child_ctx("SqlQueryPlanner::query"); + ctx.sql_to_physical_plan_with_params(query, params).await + } +} diff --git a/iox_query/src/lib.rs b/iox_query/src/lib.rs new file mode 100644 index 0000000..e5afb92 --- /dev/null +++ b/iox_query/src/lib.rs @@ -0,0 +1,227 @@ +//! Contains the IOx query engine +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_debug_implementations, + clippy::explicit_iter_loop, + clippy::use_self, + clippy::clone_on_ref_ptr, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] +#![allow(unreachable_pub)] + +use datafusion_util::MemoryStream; +use futures::TryStreamExt; +use query_log::{QueryCompletedToken, QueryText, StateReceived}; +use trace::{ctx::SpanContext, span::Span}; + +use tracker::InstrumentedAsyncOwnedSemaphorePermit; +// Workaround for "unused crate" lint false positives. 
+use workspace_hack as _; + +use arrow::{ + datatypes::{DataType, Field, SchemaRef}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use data_types::{ChunkId, ChunkOrder, TransitionPartitionId}; +use datafusion::{ + error::DataFusionError, + physical_plan::{SendableRecordBatchStream, Statistics}, + prelude::{Expr, SessionContext}, +}; +use exec::IOxSessionContext; +use once_cell::sync::Lazy; +use parquet_file::storage::ParquetExecInput; +use schema::{sort::SortKey, Projection, Schema}; +use std::{any::Any, fmt::Debug, sync::Arc}; + +pub mod chunk_statistics; +pub mod config; +pub mod exec; +pub mod frontend; +pub mod logical_optimizer; +pub mod physical_optimizer; +pub mod plan; +pub mod provider; +pub mod pruning; +pub mod query_log; +pub mod statistics; +pub mod util; + +pub use query_functions::group_by::{Aggregate, WindowDuration}; + +/// The name of the virtual column that represents the chunk order. +pub const CHUNK_ORDER_COLUMN_NAME: &str = "__chunk_order"; + +static CHUNK_ORDER_FIELD: Lazy> = + Lazy::new(|| Arc::new(Field::new(CHUNK_ORDER_COLUMN_NAME, DataType::Int64, false))); + +/// Generate [`Field`] for [chunk order column](CHUNK_ORDER_COLUMN_NAME). +pub fn chunk_order_field() -> Arc { + Arc::clone(&CHUNK_ORDER_FIELD) +} + +/// A single chunk of data. +pub trait QueryChunk: Debug + Send + Sync + 'static { + /// Return a statistics of the data + fn stats(&self) -> Arc; + + /// return a reference to the summary of the data held in this chunk + fn schema(&self) -> &Schema; + + /// Return partition identifier for this chunk + fn partition_id(&self) -> &TransitionPartitionId; + + /// return a reference to the sort key if any + fn sort_key(&self) -> Option<&SortKey>; + + /// returns the Id of this chunk. Ids are unique within a + /// particular partition. 
+ fn id(&self) -> ChunkId; + + /// Returns true if the chunk may contain a duplicate "primary + /// key" within itself + fn may_contain_pk_duplicates(&self) -> bool; + + /// Provides access to raw [`QueryChunk`] data. + /// + /// The engine assume that minimal work shall be performed to gather the `QueryChunkData`. + fn data(&self) -> QueryChunkData; + + /// Returns chunk type. Useful in tests and debug logs. + fn chunk_type(&self) -> &str; + + /// Order of this chunk relative to other overlapping chunks. + fn order(&self) -> ChunkOrder; + + /// Return backend as [`Any`] which can be used to downcast to a specific implementation. + fn as_any(&self) -> &dyn Any; +} + +/// `QueryNamespace` is the main trait implemented by the IOx subsystems that store actual data. +/// +/// Namespaces store data organized by partitions and each partition stores data in Chunks. +#[async_trait] +pub trait QueryNamespace: Debug + Send + Sync { + /// Returns a set of chunks within the partition with data that may match the provided + /// filter expression. + /// + /// If possible, chunks which have no rows that can possibly match the filter may be omitted. + /// + /// If projection is `None`, returned chunks will include all columns of its original data. + /// Otherwise, returned chunks will include PK columns (tags and time) and columns specified in + /// the projection. Projecting chunks here is optional and a mere optimization. The query + /// subsystem does NOT rely on it. + async fn chunks( + &self, + table_name: &str, + filters: &[Expr], + projection: Option<&Vec>, + ctx: IOxSessionContext, + ) -> Result>, DataFusionError>; + + /// Retention cutoff time. + /// + /// This gives the timestamp (NOT the duration) at which data should be cut off. This should result in an additional + /// filter of the following form: + /// + /// ```text + /// time >= retention_time_ns + /// ``` + /// + /// Returns `None` if now retention policy was defined. 
+ fn retention_time_ns(&self) -> Option; + + /// Record that particular type of query was run / planned + fn record_query( + &self, + span_ctx: Option<&SpanContext>, + query_type: &'static str, + query_text: QueryText, + ) -> QueryCompletedToken; + + /// Returns a new execution context suitable for running queries + fn new_query_context(&self, span_ctx: Option) -> IOxSessionContext; +} + +/// Trait that allows the query engine (which includes flight and storage/InfluxRPC) to access a +/// virtual set of namespaces. +/// +/// This is the only entry point for the query engine. This trait and the traits reachable by it (e.g. +/// [`QueryNamespace`]) are the only wait to access the catalog and payload data. +#[async_trait] +pub trait QueryNamespaceProvider: std::fmt::Debug + Send + Sync + 'static { + /// Get namespace if it exists. + /// + /// System tables may contain debug information depending on `include_debug_info_tables`. + async fn db( + &self, + name: &str, + span: Option, + include_debug_info_tables: bool, + ) -> Option>; + + /// Acquire concurrency-limiting sempahore + async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit; +} + +/// Raw data of a [`QueryChunk`]. +pub enum QueryChunkData { + /// Record batches. + RecordBatches(SendableRecordBatchStream), + + /// Parquet file. + /// + /// See [`ParquetExecInput`] for details. + Parquet(ParquetExecInput), +} + +impl QueryChunkData { + /// Read data into [`RecordBatch`]es. This is mostly meant for testing! + pub async fn read_to_batches( + self, + schema: &Schema, + session_ctx: &SessionContext, + ) -> Vec { + match self { + Self::RecordBatches(batches) => batches.try_collect::>().await.unwrap(), + Self::Parquet(exec_input) => exec_input + .read_to_batches(schema.as_arrow(), Projection::All, session_ctx) + .await + .unwrap(), + } + } + + /// Create data based on batches and schema. 
+ pub fn in_mem(batches: Vec, schema: SchemaRef) -> Self { + let s = MemoryStream::new_with_schema(batches, schema); + let s: SendableRecordBatchStream = Box::pin(s); + Self::RecordBatches(s) + } +} + +impl std::fmt::Debug for QueryChunkData { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::RecordBatches(_) => f.debug_tuple("RecordBatches").field(&"").finish(), + Self::Parquet(input) => f.debug_tuple("Parquet").field(input).finish(), + } + } +} + +// Note: I would like to compile this module only in the 'test' cfg, +// but when I do so then other modules can not find them. For example: +// +// error[E0433]: failed to resolve: could not find `test` in `storage` +// --> src/server/mutable_buffer_routes.rs:353:19 +// | +// 353 | use iox_query::test::TestDatabaseStore; +// | ^^^^ could not find `test` in `query` + +// +//#[cfg(test)] +pub mod test; diff --git a/iox_query/src/logical_optimizer/extract_sleep.rs b/iox_query/src/logical_optimizer/extract_sleep.rs new file mode 100644 index 0000000..2f11446 --- /dev/null +++ b/iox_query/src/logical_optimizer/extract_sleep.rs @@ -0,0 +1,100 @@ +use std::sync::Arc; + +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::{ + common::{tree_node::TreeNodeRewriter, DFSchema}, + error::DataFusionError, + logical_expr::{expr_rewriter::rewrite_preserving_name, Extension, LogicalPlan}, + optimizer::{OptimizerConfig, OptimizerRule}, + prelude::{lit, Expr}, + scalar::ScalarValue, +}; +use query_functions::SLEEP_UDF_NAME; + +use crate::exec::sleep::SleepNode; + +/// Rewrites the ["sleep" UDF](SLEEP_UDF_NAME) to a NULL expression and a [`SleepNode`]. +/// +/// See [`crate::exec::sleep`] for more details. +#[derive(Debug, Clone)] +pub struct ExtractSleep {} + +impl ExtractSleep { + /// Create new optimizer rule. 
+ pub fn new() -> Self { + Self {} + } +} + +impl OptimizerRule for ExtractSleep { + fn name(&self) -> &str { + "extract_sleep" + } + + fn try_optimize( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> datafusion::error::Result> { + optimize(plan).map(Some) + } +} + +fn optimize(plan: &LogicalPlan) -> Result { + let new_inputs = plan + .inputs() + .iter() + .map(|input| optimize(input)) + .collect::, DataFusionError>>()?; + + let mut schema = + new_inputs + .iter() + .map(|input| input.schema()) + .fold(DFSchema::empty(), |mut lhs, rhs| { + lhs.merge(rhs); + lhs + }); + + schema.merge(plan.schema()); + + let mut expr_rewriter = Rewriter::default(); + + let new_exprs = plan + .expressions() + .into_iter() + .map(|expr| rewrite_preserving_name(expr, &mut expr_rewriter)) + .collect::, DataFusionError>>()?; + let mut plan = plan.with_new_exprs(new_exprs, &new_inputs)?; + + if !expr_rewriter.found_exprs.is_empty() { + plan = LogicalPlan::Extension(Extension { + node: Arc::new(SleepNode::new(plan, expr_rewriter.found_exprs)), + }); + } + + Ok(plan) +} + +#[derive(Default)] +struct Rewriter { + found_exprs: Vec, +} + +impl TreeNodeRewriter for Rewriter { + type N = Expr; + + fn mutate(&mut self, expr: Expr) -> Result { + match expr { + Expr::ScalarFunction(ScalarFunction { func_def, mut args }) => { + if func_def.name() == SLEEP_UDF_NAME { + self.found_exprs.append(&mut args); + return Ok(lit(ScalarValue::Null)); + } + + Ok(Expr::ScalarFunction(ScalarFunction { func_def, args })) + } + _ => Ok(expr), + } + } +} diff --git a/iox_query/src/logical_optimizer/handle_gapfill.rs b/iox_query/src/logical_optimizer/handle_gapfill.rs new file mode 100644 index 0000000..bd046b1 --- /dev/null +++ b/iox_query/src/logical_optimizer/handle_gapfill.rs @@ -0,0 +1,1176 @@ +//! An optimizer rule that transforms a plan +//! to fill gaps in time series data. 
+ +pub mod range_predicate; + +use crate::exec::gapfill::{FillStrategy, GapFill, GapFillParams}; +use datafusion::logical_expr::ScalarFunctionDefinition; +use datafusion::{ + common::tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter, VisitRecursion}, + error::{DataFusionError, Result}, + logical_expr::{ + expr::{Alias, ScalarFunction}, + utils::expr_to_columns, + Aggregate, BuiltinScalarFunction, Extension, LogicalPlan, Projection, + }, + optimizer::{optimizer::ApplyOrder, OptimizerConfig, OptimizerRule}, + prelude::{col, Column, Expr}, +}; +use hashbrown::{hash_map, HashMap}; +use query_functions::gapfill::{DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME}; +use std::{ + collections::HashSet, + ops::{Bound, Range}, + sync::Arc, +}; + +/// This optimizer rule enables gap-filling semantics for SQL queries +/// that contain calls to `DATE_BIN_GAPFILL()` and related functions +/// like `LOCF()`. +/// +/// In SQL a typical gap-filling query might look like this: +/// ```sql +/// SELECT +/// location, +/// DATE_BIN_GAPFILL(INTERVAL '1 minute', time, '1970-01-01T00:00:00Z') AS minute, +/// LOCF(AVG(temp)) +/// FROM temps +/// WHERE time > NOW() - INTERVAL '6 hours' AND time < NOW() +/// GROUP BY LOCATION, MINUTE +/// ``` +/// +/// The initial logical plan will look like this: +/// +/// ```text +/// Projection: location, date_bin_gapfill(...) as minute, LOCF(AVG(temps.temp)) +/// Aggregate: groupBy=[[location, date_bin_gapfill(...)]], aggr=[[AVG(temps.temp)]] +/// ... +/// ``` +/// +/// This optimizer rule transforms it to this: +/// +/// ```text +/// Projection: location, date_bin_gapfill(...) as minute, AVG(temps.temp) +/// GapFill: groupBy=[[location, date_bin_gapfill(...))]], aggr=[[LOCF(AVG(temps.temp))]], start=..., stop=... +/// Aggregate: groupBy=[[location, date_bin(...))]], aggr=[[AVG(temps.temp)]] +/// ... 
+/// ``` +/// +/// For `Aggregate` nodes that contain calls to `DATE_BIN_GAPFILL`, this rule will: +/// - Convert `DATE_BIN_GAPFILL()` to `DATE_BIN()` +/// - Create a `GapFill` node that fills in gaps in the query +/// - The range for gap filling is found by analyzing any preceding `Filter` nodes +/// +/// If there is a `Projection` above the `GapFill` node that gets created: +/// - Look for calls to gap-filling functions like `LOCF` +/// - Push down these functions into the `GapFill` node, updating the fill strategy for the column. +/// +/// Note: both `DATE_BIN_GAPFILL` and `LOCF` are functions that don't have implementations. +/// This rule must rewrite the plan to get rid of them. +pub struct HandleGapFill; + +impl HandleGapFill { + pub fn new() -> Self { + Self {} + } +} + +impl Default for HandleGapFill { + fn default() -> Self { + Self::new() + } +} + +impl OptimizerRule for HandleGapFill { + fn try_optimize( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result> { + handle_gap_fill(plan) + } + + fn name(&self) -> &str { + "handle_gap_fill" + } + + fn apply_order(&self) -> Option { + Some(ApplyOrder::BottomUp) + } +} + +fn handle_gap_fill(plan: &LogicalPlan) -> Result> { + let res = match plan { + LogicalPlan::Aggregate(aggr) => { + handle_aggregate(aggr).map_err(|e| e.context("handle_aggregate"))? + } + LogicalPlan::Projection(proj) => { + handle_projection(proj).map_err(|e| e.context("handle_projection"))? + } + _ => None, + }; + + if res.is_none() { + // no transformation was applied, + // so make sure the plan is not using gap filling + // functions in an unsupported way. + check_node(plan)?; + } + + Ok(res) +} + +fn handle_aggregate(aggr: &Aggregate) -> Result> { + let Aggregate { + input, + group_expr, + aggr_expr, + schema, + .. + } = aggr; + + // new_group_expr has DATE_BIN_GAPFILL replaced with DATE_BIN. 
+ let RewriteInfo { + new_group_expr, + date_bin_gapfill_index, + date_bin_gapfill_args, + } = if let Some(v) = + replace_date_bin_gapfill(group_expr).map_err(|e| e.context("replace_date_bin_gapfill"))? + { + v + } else { + return Ok(None); + }; + + let new_aggr_plan = { + // Create the aggregate node with the same output schema as the orignal + // one. This means that there will be an output column called `date_bin_gapfill(...)` + // even though the actual expression populating that column will be `date_bin(...)`. + // This seems acceptable since it avoids having to deal with renaming downstream. + let new_aggr_plan = Aggregate::try_new_with_schema( + Arc::clone(input), + new_group_expr, + aggr_expr.clone(), + Arc::clone(schema), + ) + .map_err(|e| e.context("Aggregate::try_new_with_schema"))?; + let new_aggr_plan = LogicalPlan::Aggregate(new_aggr_plan); + check_node(&new_aggr_plan).map_err(|e| e.context("check_node"))?; + new_aggr_plan + }; + + let new_gap_fill_plan = + build_gapfill_node(new_aggr_plan, date_bin_gapfill_index, date_bin_gapfill_args) + .map_err(|e| e.context("build_gapfill_node"))?; + Ok(Some(new_gap_fill_plan)) +} + +fn build_gapfill_node( + new_aggr_plan: LogicalPlan, + date_bin_gapfill_index: usize, + date_bin_gapfill_args: Vec, +) -> Result { + match date_bin_gapfill_args.len() { + 2 | 3 => (), + nargs => { + return Err(DataFusionError::Plan(format!( + "DATE_BIN_GAPFILL expects 2 or 3 arguments, got {nargs}", + ))); + } + } + + let mut args_iter = date_bin_gapfill_args.into_iter(); + + // Ensure that stride argument is a scalar + let stride = args_iter.next().unwrap(); + validate_scalar_expr("stride argument to DATE_BIN_GAPFILL", &stride) + .map_err(|e| e.context("validate_scalar_expr"))?; + + fn get_column(expr: Expr) -> Result { + match expr { + Expr::Column(c) => Ok(c), + Expr::Cast(c) => get_column(*c.expr), + _ => Err(DataFusionError::Plan( + "DATE_BIN_GAPFILL requires a column as the source argument".to_string(), + )), + } + } + + // 
Ensure that the source argument is a column + let time_col = + get_column(args_iter.next().unwrap()).map_err(|e| e.context("get time column"))?; + + // Ensure that a time range was specified and is valid for gap filling + let time_range = range_predicate::find_time_range(new_aggr_plan.inputs()[0], &time_col) + .map_err(|e| e.context("find time range"))?; + validate_time_range(&time_range).map_err(|e| e.context("validate time range"))?; + + // Ensure that origin argument is a scalar + let origin = args_iter.next(); + if let Some(ref origin) = origin { + validate_scalar_expr("origin argument to DATE_BIN_GAPFILL", origin) + .map_err(|e| e.context("validate origin"))?; + } + + // Make sure the time output to the gapfill node matches what the + // aggregate output was. + let time_column = + col(new_aggr_plan.schema().fields()[date_bin_gapfill_index].qualified_column()); + + let LogicalPlan::Aggregate(aggr) = &new_aggr_plan else { + return Err(DataFusionError::Internal(format!( + "Expected Aggregate plan, got {}", + new_aggr_plan.display() + ))); + }; + let mut new_group_expr: Vec<_> = aggr + .schema + .fields() + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect(); + let aggr_expr = new_group_expr.split_off(aggr.group_expr.len()); + + let fill_behavior = aggr_expr + .iter() + .cloned() + .map(|e| (e, FillStrategy::Null)) + .collect(); + + Ok(LogicalPlan::Extension(Extension { + node: Arc::new( + GapFill::try_new( + Arc::new(new_aggr_plan), + new_group_expr, + aggr_expr, + GapFillParams { + stride, + time_column, + origin, + time_range, + fill_strategy: fill_behavior, + }, + ) + .map_err(|e| e.context("GapFill::try_new"))?, + ), + })) +} + +fn validate_time_range(range: &Range>) -> Result<()> { + let Range { ref start, ref end } = range; + let (start, end) = match (start, end) { + (Bound::Unbounded, Bound::Unbounded) => { + return Err(DataFusionError::Plan( + "gap-filling query is missing both upper and lower time bounds".to_string(), + )) + } + 
(Bound::Unbounded, _) => Err(DataFusionError::Plan( + "gap-filling query is missing lower time bound".to_string(), + )), + (_, Bound::Unbounded) => Err(DataFusionError::Plan( + "gap-filling query is missing upper time bound".to_string(), + )), + ( + Bound::Included(start) | Bound::Excluded(start), + Bound::Included(end) | Bound::Excluded(end), + ) => Ok((start, end)), + }?; + validate_scalar_expr("lower time bound", start)?; + validate_scalar_expr("upper time bound", end) +} + +fn validate_scalar_expr(what: &str, e: &Expr) -> Result<()> { + let mut cols = HashSet::new(); + expr_to_columns(e, &mut cols)?; + if !cols.is_empty() { + Err(DataFusionError::Plan(format!( + "{what} for gap fill query must evaluate to a scalar" + ))) + } else { + Ok(()) + } +} + +struct RewriteInfo { + // Group expressions with DATE_BIN_GAPFILL rewritten to DATE_BIN. + new_group_expr: Vec, + // The index of the group expression that contained the call to DATE_BIN_GAPFILL. + date_bin_gapfill_index: usize, + // The arguments to the call to DATE_BIN_GAPFILL. + date_bin_gapfill_args: Vec, +} + +// Iterate over the group expression list. +// If it finds no occurrences of date_bin_gapfill, it will return None. +// If it finds more than one occurrence it will return an error. +// Otherwise it will return a RewriteInfo for the optimizer rule to use. +fn replace_date_bin_gapfill(group_expr: &[Expr]) -> Result> { + let mut date_bin_gapfill_count = 0; + let mut dbg_idx = None; + group_expr + .iter() + .enumerate() + .try_for_each(|(i, e)| -> Result<()> { + let fn_cnt = count_udf(e, DATE_BIN_GAPFILL_UDF_NAME)?; + date_bin_gapfill_count += fn_cnt; + if fn_cnt > 0 { + dbg_idx = Some(i); + } + Ok(()) + })?; + match date_bin_gapfill_count { + 0 => return Ok(None), + 1 => { + // Make sure that the call to DATE_BIN_GAPFILL is root expression + // excluding aliases. 
+ let dbg_idx = dbg_idx.expect("should have found exactly one call"); + if !matches_udf( + unwrap_alias(&group_expr[dbg_idx]), + DATE_BIN_GAPFILL_UDF_NAME, + ) { + return Err(DataFusionError::Plan( + "DATE_BIN_GAPFILL must be a top-level expression in the GROUP BY clause when gap filling. It cannot be part of another expression or cast".to_string(), + )); + } + } + _ => { + return Err(DataFusionError::Plan( + "DATE_BIN_GAPFILL specified more than once".to_string(), + )) + } + } + + let date_bin_gapfill_index = dbg_idx.expect("should be found exactly one call"); + + let mut rewriter = DateBinGapfillRewriter { args: None }; + let group_expr = group_expr + .iter() + .enumerate() + .map(|(i, e)| { + if i == date_bin_gapfill_index { + e.clone().rewrite(&mut rewriter) + } else { + Ok(e.clone()) + } + }) + .collect::>>()?; + let date_bin_gapfill_args = rewriter.args.expect("should have found args"); + + Ok(Some(RewriteInfo { + new_group_expr: group_expr, + date_bin_gapfill_index, + date_bin_gapfill_args, + })) +} + +fn unwrap_alias(mut e: &Expr) -> &Expr { + loop { + match e { + Expr::Alias(Alias { expr, .. }) => e = expr.as_ref(), + e => break e, + } + } +} + +struct DateBinGapfillRewriter { + args: Option>, +} + +impl TreeNodeRewriter for DateBinGapfillRewriter { + type N = Expr; + fn pre_visit(&mut self, expr: &Expr) -> Result { + match expr { + Expr::ScalarFunction(fun) if fun.func_def.name() == DATE_BIN_GAPFILL_UDF_NAME => { + Ok(RewriteRecursion::Mutate) + } + _ => Ok(RewriteRecursion::Continue), + } + } + + fn mutate(&mut self, expr: Expr) -> Result { + // We need to preserve the name of the original expression + // so that everything stays wired up. 
+ let orig_name = expr.display_name()?; + match expr { + Expr::ScalarFunction(ScalarFunction { func_def, args }) + if func_def.name() == DATE_BIN_GAPFILL_UDF_NAME => + { + self.args = Some(args.clone()); + Ok(Expr::ScalarFunction(ScalarFunction { + func_def: ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::DateBin), + args, + }) + .alias(orig_name)) + } + _ => Ok(expr), + } + } +} + +fn udf_to_fill_strategy(name: &str) -> Option { + match name { + LOCF_UDF_NAME => Some(FillStrategy::PrevNullAsMissing), + INTERPOLATE_UDF_NAME => Some(FillStrategy::LinearInterpolate), + _ => None, + } +} + +fn fill_strategy_to_udf(fs: &FillStrategy) -> Result<&'static str> { + match fs { + FillStrategy::PrevNullAsMissing => Ok(LOCF_UDF_NAME), + FillStrategy::LinearInterpolate => Ok(INTERPOLATE_UDF_NAME), + _ => Err(DataFusionError::Internal(format!( + "unknown UDF for fill strategy {fs:?}" + ))), + } +} + +fn handle_projection(proj: &Projection) -> Result> { + let Projection { + input, + expr: proj_exprs, + schema: proj_schema, + .. + } = proj; + let Some(child_gapfill) = (match input.as_ref() { + LogicalPlan::Extension(Extension { node }) => node.as_any().downcast_ref::(), + _ => None, + }) else { + // If this is not a projection that is a parent to a GapFill node, + // then there is nothing to do. + return Ok(None); + }; + + let mut fill_fn_rewriter = FillFnRewriter { + aggr_col_fill_map: HashMap::new(), + }; + let new_proj_exprs = proj_exprs + .iter() + .map(|expr| { + expr.clone() + .rewrite(&mut fill_fn_rewriter) + .map_err(|e| e.context(format!("rewrite: {expr}"))) + }) + .collect::>>()?; + + let FillFnRewriter { aggr_col_fill_map } = fill_fn_rewriter; + if aggr_col_fill_map.is_empty() { + return Ok(None); + } + + // Clone the existing GapFill node, then modify it in place + // to reflect the new fill strategy. 
+ let mut new_gapfill = child_gapfill.clone(); + for (e, fs) in aggr_col_fill_map { + let udf = fill_strategy_to_udf(&fs).map_err(|e| e.context("fill_strategy_to_udf"))?; + if new_gapfill.replace_fill_strategy(&e, fs).is_none() { + // There was a gap filling function called on a non-aggregate column. + return Err(DataFusionError::Plan(format!( + "{udf} must be called on an aggregate column in a gap-filling query", + ))); + } + } + + let new_proj = { + let mut proj = proj.clone(); + proj.expr = new_proj_exprs; + proj.input = Arc::new(LogicalPlan::Extension(Extension { + node: Arc::new(new_gapfill), + })); + proj.schema = Arc::clone(proj_schema); + LogicalPlan::Projection(proj) + }; + + Ok(Some(new_proj)) +} + +/// Implements `TreeNodeRewriter`: +/// - Traverses over the expressions in a projection node +/// - If it finds `locf(col)` or `interpolate(col)`, +/// it replaces them with `col AS ` +/// - Collects into [`Self::aggr_col_fill_map`] which correlates +/// aggregate columns to their [`FillStrategy`]. 
+struct FillFnRewriter { + aggr_col_fill_map: HashMap, +} + +impl TreeNodeRewriter for FillFnRewriter { + type N = Expr; + fn pre_visit(&mut self, expr: &Expr) -> Result { + match expr { + Expr::ScalarFunction(fun) if udf_to_fill_strategy(fun.func_def.name()).is_some() => { + Ok(RewriteRecursion::Mutate) + } + _ => Ok(RewriteRecursion::Continue), + } + } + + fn mutate(&mut self, expr: Expr) -> Result { + let orig_name = expr.display_name()?; + match expr { + Expr::ScalarFunction(ref fun) + if udf_to_fill_strategy(fun.func_def.name()).is_none() => + { + Ok(expr) + } + Expr::ScalarFunction(mut fun) => { + let fs = udf_to_fill_strategy(fun.func_def.name()).expect("must be a fill fn"); + let arg = fun.args.remove(0); + self.add_fill_strategy(arg.clone(), fs)?; + Ok(arg.alias(orig_name)) + } + _ => Ok(expr), + } + } +} + +impl FillFnRewriter { + fn add_fill_strategy(&mut self, e: Expr, fs: FillStrategy) -> Result<()> { + match self.aggr_col_fill_map.entry(e) { + hash_map::Entry::Occupied(_) => Err(DataFusionError::NotImplemented( + "multiple fill strategies for the same column".to_string(), + )), + hash_map::Entry::Vacant(ve) => { + ve.insert(fs); + Ok(()) + } + } + } +} + +fn count_udf(e: &Expr, name: &str) -> Result { + let mut count = 0; + e.apply(&mut |expr| { + if matches_udf(expr, name) { + count += 1; + } + Ok(VisitRecursion::Continue) + })?; + Ok(count) +} + +fn matches_udf(e: &Expr, name: &str) -> bool { + matches!( + e, + Expr::ScalarFunction(fun) if fun.func_def.name() == name + ) +} + +fn check_node(node: &LogicalPlan) -> Result<()> { + node.expressions().iter().try_for_each(|expr| { + let dbg_count = count_udf(expr, DATE_BIN_GAPFILL_UDF_NAME)?; + if dbg_count > 0 { + return Err(DataFusionError::Plan(format!( + "{DATE_BIN_GAPFILL_UDF_NAME} may only be used as a GROUP BY expression" + ))); + } + + for fn_name in [LOCF_UDF_NAME, INTERPOLATE_UDF_NAME] { + if count_udf(expr, fn_name)? 
> 0 { + return Err(DataFusionError::Plan(format!( + "{fn_name} may only be used in the SELECT list of a gap-filling query" + ))); + } + } + Ok(()) + }) +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use super::HandleGapFill; + + use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; + use datafusion::error::Result; + use datafusion::logical_expr::builder::table_scan_with_filters; + use datafusion::logical_expr::{logical_plan, LogicalPlan, LogicalPlanBuilder}; + use datafusion::optimizer::optimizer::Optimizer; + use datafusion::optimizer::OptimizerContext; + use datafusion::prelude::{avg, case, col, lit, min, Expr}; + use datafusion::scalar::ScalarValue; + use datafusion_util::lit_timestamptz_nano; + use query_functions::gapfill::{ + DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME, + }; + + fn schema() -> Schema { + Schema::new(vec![ + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new( + "time2", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new("loc", DataType::Utf8, false), + Field::new("temp", DataType::Float64, false), + ]) + } + + fn table_scan() -> Result { + logical_plan::table_scan(Some("temps"), &schema(), None)?.build() + } + + fn date_bin_gapfill(interval: Expr, time: Expr) -> Result { + date_bin_gapfill_with_origin(interval, time, None) + } + + fn date_bin_gapfill_with_origin( + interval: Expr, + time: Expr, + origin: Option, + ) -> Result { + let mut args = vec![interval, time]; + if let Some(origin) = origin { + args.push(origin) + } + + Ok(query_functions::registry() + .udf(DATE_BIN_GAPFILL_UDF_NAME)? + .call(args)) + } + + fn locf(arg: Expr) -> Result { + Ok(query_functions::registry() + .udf(LOCF_UDF_NAME)? + .call(vec![arg])) + } + + fn interpolate(arg: Expr) -> Result { + Ok(query_functions::registry() + .udf(INTERPOLATE_UDF_NAME)? 
+ .call(vec![arg])) + } + + fn optimize(plan: &LogicalPlan) -> Result> { + let optimizer = Optimizer::with_rules(vec![Arc::new(HandleGapFill)]); + optimizer.optimize_recursively(&optimizer.rules[0], plan, &OptimizerContext::new()) + } + + fn assert_optimizer_err(plan: &LogicalPlan, expected: &str) { + match optimize(plan) { + Ok(plan) => assert_eq!(format!("{}", plan.unwrap().display_indent()), "an error"), + Err(ref e) => { + let actual = e.to_string(); + if expected.is_empty() || !actual.contains(expected) { + assert_eq!(actual, expected) + } + } + } + } + + fn assert_optimization_skipped(plan: &LogicalPlan) -> Result<()> { + let new_plan = optimize(plan)?; + if new_plan.is_none() { + return Ok(()); + } + assert_eq!( + format!("{}", plan.display_indent()), + format!("{}", new_plan.unwrap().display_indent()) + ); + Ok(()) + } + + fn format_optimized_plan(plan: &LogicalPlan) -> Result> { + let plan = optimize(plan)? + .expect("plan should have been optimized") + .display_indent() + .to_string(); + Ok(plan.split('\n').map(|s| s.to_string()).collect()) + } + + #[test] + fn misplaced_dbg_err() -> Result<()> { + // date_bin_gapfill used in a filter should produce an error + let scan = table_scan()?; + let plan = LogicalPlanBuilder::from(scan) + .filter( + date_bin_gapfill( + lit(ScalarValue::IntervalDayTime(Some(600_000))), + col("temp"), + )? + .gt(lit(100.0)), + )? + .build()?; + assert_optimizer_err( + &plan, + "Error during planning: date_bin_gapfill may only be used as a GROUP BY expression", + ); + Ok(()) + } + + /// calling LOCF in a WHERE predicate is not valid + #[test] + fn misplaced_locf_err() -> Result<()> { + // date_bin_gapfill used in a filter should produce an error + let scan = table_scan()?; + let plan = LogicalPlanBuilder::from(scan) + .filter(locf(col("temp"))?.gt(lit(100.0)))? 
+ .build()?; + assert_optimizer_err( + &plan, + "Error during planning: locf may only be used in the SELECT list of a gap-filling query", + ); + Ok(()) + } + + /// calling INTERPOLATE in a WHERE predicate is not valid + #[test] + fn misplaced_interpolate_err() -> Result<()> { + // date_bin_gapfill used in a filter should produce an error + let scan = table_scan()?; + let plan = LogicalPlanBuilder::from(scan) + .filter(interpolate(col("temp"))?.gt(lit(100.0)))? + .build()?; + assert_optimizer_err( + &plan, + "Error during planning: interpolate may only be used in the SELECT list of a gap-filling query", + ); + Ok(()) + } + /// calling LOCF on the SELECT list but not on an aggregate column is not valid. + #[test] + fn misplaced_locf_non_agg_err() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![ + col("loc"), + date_bin_gapfill(lit(ScalarValue::IntervalDayTime(Some(60_000))), col("time"))?, + ], + vec![avg(col("temp")), min(col("temp"))], + )? + .project(vec![ + locf(col("loc"))?, + col("date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)"), + locf(col("AVG(temps.temp)"))?, + locf(col("MIN(temps.temp)"))?, + ])? + .build()?; + assert_optimizer_err( + &plan, + "locf must be called on an aggregate column in a gap-filling query", + ); + Ok(()) + } + + #[test] + fn different_fill_strategies_one_col() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![ + col("loc"), + date_bin_gapfill(lit(ScalarValue::IntervalDayTime(Some(60_000))), col("time"))?, + ], + vec![avg(col("temp")), min(col("temp"))], + )? 
+ .project(vec![ + locf(col("loc"))?, + col("date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)"), + locf(col("AVG(temps.temp)"))?, + interpolate(col("AVG(temps.temp)"))?, + ])? + .build()?; + assert_optimizer_err( + &plan, + "This feature is not implemented: multiple fill strategies for the same column", + ); + Ok(()) + } + + #[test] + fn nonscalar_origin() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill_with_origin( + lit(ScalarValue::IntervalDayTime(Some(60_000))), + col("time"), + Some(col("time2")), + )?], + vec![avg(col("temp"))], + )? + .build()?; + assert_optimizer_err( + &plan, + "Error during planning: origin argument to DATE_BIN_GAPFILL for gap fill query must evaluate to a scalar", + ); + Ok(()) + } + + #[test] + fn nonscalar_stride() -> Result<()> { + let stride = case(col("loc")) + .when( + lit("kitchen"), + lit(ScalarValue::IntervalDayTime(Some(60_000))), + ) + .otherwise(lit(ScalarValue::IntervalDayTime(Some(30_000)))) + .unwrap(); + + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill(stride, col("time"))?], + vec![avg(col("temp"))], + )? 
+ .build()?; + assert_optimizer_err( + &plan, + "Error during planning: stride argument to DATE_BIN_GAPFILL for gap fill query must evaluate to a scalar", + ); + Ok(()) + } + + #[test] + fn time_range_errs() -> Result<()> { + let cases = vec![ + ( + lit(true), + "Error during planning: gap-filling query is missing both upper and lower time bounds", + ), + ( + col("time").gt_eq(lit_timestamptz_nano(1000)), + "Error during planning: gap-filling query is missing upper time bound", + ), + ( + col("time").lt(lit_timestamptz_nano(2000)), + "Error during planning: gap-filling query is missing lower time bound", + ), + ( + col("time").gt_eq(col("time2")).and( + col("time").lt(lit_timestamptz_nano(2000))), + "Error during planning: lower time bound for gap fill query must evaluate to a scalar", + ), + ( + col("time").gt_eq(lit_timestamptz_nano(2000)).and( + col("time").lt(col("time2"))), + "Error during planning: upper time bound for gap fill query must evaluate to a scalar", + ) + ]; + for c in cases { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter(c.0)? + .aggregate( + vec![date_bin_gapfill( + lit(ScalarValue::IntervalDayTime(Some(60_000))), + col("time"), + )?], + vec![avg(col("temp"))], + )? + .build()?; + assert_optimizer_err(&plan, c.1); + } + Ok(()) + } + + #[test] + fn no_change() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .aggregate(vec![col("loc")], vec![avg(col("temp"))])? + .build()?; + assert_optimization_skipped(&plan)?; + Ok(()) + } + + #[test] + fn date_bin_gapfill_simple() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill( + lit(ScalarValue::IntervalDayTime(Some(60_000))), + col("time"), + )?], + vec![avg(col("temp"))], + )? 
+ .build()?; + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan)?, + @r###" + --- + - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)], aggr=[[AVG(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)]], aggr=[[AVG(temps.temp)]]" + - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" + - " TableScan: temps" + "###); + Ok(()) + } + + #[test] + fn date_bin_gapfill_origin() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill_with_origin( + lit(ScalarValue::IntervalDayTime(Some(60_000))), + col("time"), + Some(lit_timestamptz_nano(7)), + )?], + vec![avg(col("temp"))], + )? 
+ .build()?; + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan)?, + @r###" + --- + - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,TimestampNanosecond(7, None))], aggr=[[AVG(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,TimestampNanosecond(7, None)), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time, TimestampNanosecond(7, None)) AS date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time,TimestampNanosecond(7, None))]], aggr=[[AVG(temps.temp)]]" + - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" + - " TableScan: temps" + "###); + Ok(()) + } + #[test] + fn two_group_exprs() -> Result<()> { + // grouping by date_bin_gapfill(...), loc + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![ + date_bin_gapfill(lit(ScalarValue::IntervalDayTime(Some(60_000))), col("time"))?, + col("loc"), + ], + vec![avg(col("temp"))], + )? 
+ .build()?; + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan)?, + @r###" + --- + - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), temps.loc], aggr=[[AVG(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), temps.loc]], aggr=[[AVG(temps.temp)]]" + - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" + - " TableScan: temps" + "###); + Ok(()) + } + + #[test] + fn double_date_bin_gapfill() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .aggregate( + vec![ + date_bin_gapfill(lit(ScalarValue::IntervalDayTime(Some(60_000))), col("time"))?, + date_bin_gapfill(lit(ScalarValue::IntervalDayTime(Some(30_000))), col("time"))?, + ], + vec![avg(col("temp"))], + )? + .build()?; + assert_optimizer_err( + &plan, + "Error during planning: DATE_BIN_GAPFILL specified more than once", + ); + Ok(()) + } + + #[test] + fn with_projection() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill( + lit(ScalarValue::IntervalDayTime(Some(60_000))), + col("time"), + )?], + vec![avg(col("temp"))], + )? + .project(vec![ + col("date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)"), + col("AVG(temps.temp)"), + ])? 
+ .build()?; + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan)?, + @r###" + --- + - "Projection: date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), AVG(temps.temp)" + - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)], aggr=[[AVG(temps.temp)]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)]], aggr=[[AVG(temps.temp)]]" + - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" + - " TableScan: temps" + "###); + Ok(()) + } + + #[test] + fn with_locf() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill( + lit(ScalarValue::IntervalDayTime(Some(60_000))), + col("time"), + )?], + vec![avg(col("temp")), min(col("temp"))], + )? + .project(vec![ + col("date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)"), + locf(col("AVG(temps.temp)"))?, + locf(col("MIN(temps.temp)"))?, + ])? 
+ .build()?; + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan)?, + @r###" + --- + - "Projection: date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), AVG(temps.temp) AS locf(AVG(temps.temp)), MIN(temps.temp) AS locf(MIN(temps.temp))" + - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)], aggr=[[LOCF(AVG(temps.temp)), LOCF(MIN(temps.temp))]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)]], aggr=[[AVG(temps.temp), MIN(temps.temp)]]" + - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" + - " TableScan: temps" + "###); + Ok(()) + } + + #[test] + fn with_locf_aliased() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill( + lit(ScalarValue::IntervalDayTime(Some(60_000))), + col("time"), + )?], + vec![avg(col("temp")), min(col("temp"))], + )? + .project(vec![ + col("date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)"), + locf(col("MIN(temps.temp)"))?.alias("locf_min_temp"), + ])? 
+ .build()?; + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan)?, + @r###" + --- + - "Projection: date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), MIN(temps.temp) AS locf(MIN(temps.temp)) AS locf_min_temp" + - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)], aggr=[[AVG(temps.temp), LOCF(MIN(temps.temp))]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)]], aggr=[[AVG(temps.temp), MIN(temps.temp)]]" + - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" + - " TableScan: temps" + "###); + Ok(()) + } + + #[test] + fn with_interpolate() -> Result<()> { + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter( + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + )? + .aggregate( + vec![date_bin_gapfill( + lit(ScalarValue::IntervalDayTime(Some(60_000))), + col("time"), + )?], + vec![avg(col("temp")), min(col("temp"))], + )? + .project(vec![ + col("date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)"), + interpolate(col("AVG(temps.temp)"))?, + interpolate(col("MIN(temps.temp)"))?, + ])? 
+ .build()?; + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan)?, + @r###" + --- + - "Projection: date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), AVG(temps.temp) AS interpolate(AVG(temps.temp)), MIN(temps.temp) AS interpolate(MIN(temps.temp))" + - " GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)], aggr=[[INTERPOLATE(AVG(temps.temp)), INTERPOLATE(MIN(temps.temp))]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)]], aggr=[[AVG(temps.temp), MIN(temps.temp)]]" + - " Filter: temps.time >= TimestampNanosecond(1000, None) AND temps.time < TimestampNanosecond(2000, None)" + - " TableScan: temps" + "###); + Ok(()) + } + + #[test] + fn scan_filter_not_part_of_projection() { + let schema = schema(); + let plan = table_scan_with_filters( + Some("temps"), + &schema, + Some(vec![schema.index_of("time").unwrap()]), + vec![ + col("temps.time").gt_eq(lit_timestamptz_nano(1000)), + col("temps.time").lt(lit_timestamptz_nano(2000)), + col("temps.loc").eq(lit("foo")), + ], + ) + .unwrap() + .aggregate( + vec![ + date_bin_gapfill(lit(ScalarValue::IntervalDayTime(Some(60_000))), col("time")) + .unwrap(), + ], + std::iter::empty::(), + ) + .unwrap() + .build() + .unwrap(); + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan).unwrap(), + @r###" + --- + - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)], aggr=[[]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time) AS 
date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)]], aggr=[[]]" + - " TableScan: temps projection=[time], full_filters=[temps.time >= TimestampNanosecond(1000, None), temps.time < TimestampNanosecond(2000, None), temps.loc = Utf8(\"foo\")]" + "###); + } +} diff --git a/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs b/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs new file mode 100644 index 0000000..26b9682 --- /dev/null +++ b/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs @@ -0,0 +1,367 @@ +//! Find the time range from the filters in a logical plan. +use std::{ + ops::{Bound, Range}, + sync::Arc, +}; + +use datafusion::{ + common::{ + tree_node::{TreeNode, TreeNodeVisitor, VisitRecursion}, + DFSchema, + }, + error::Result, + logical_expr::{ + utils::split_conjunction, Between, BinaryExpr, LogicalPlan, LogicalPlanBuilder, Operator, + }, + prelude::{Column, Expr}, +}; + +use super::unwrap_alias; + +/// Given a plan and a column, finds the predicates that use that column +/// and return a range with expressions for upper and lower bounds. 
+pub fn find_time_range(plan: &LogicalPlan, time_col: &Column) -> Result<Range<Bound<Expr>>> {
+    let mut v = TimeRangeVisitor {
+        col: time_col.clone(),
+        range: TimeRange::default(),
+    };
+    plan.visit(&mut v)?;
+    Ok(v.range.0)
+}
+
+struct TimeRangeVisitor {
+    col: Column,
+    range: TimeRange,
+}
+
+impl TreeNodeVisitor for TimeRangeVisitor {
+    type N = LogicalPlan;
+
+    fn pre_visit(&mut self, plan: &LogicalPlan) -> Result<VisitRecursion> {
+        match plan {
+            LogicalPlan::Projection(p) => {
+                let idx = p.schema.index_of_column(&self.col)?;
+                match unwrap_alias(&p.expr[idx]) {
+                    Expr::Column(ref c) => {
+                        self.col = c.clone();
+                        Ok(VisitRecursion::Continue)
+                    }
+                    _ => Ok(VisitRecursion::Stop),
+                }
+            }
+            LogicalPlan::Filter(f) => {
+                let range = self.range.clone();
+                let range = split_conjunction(&f.predicate)
+                    .iter()
+                    .try_fold(range, |range, expr| {
+                        range.with_expr(f.input.schema().as_ref(), &self.col, expr)
+                    })?;
+                self.range = range;
+                Ok(VisitRecursion::Continue)
+            }
+            LogicalPlan::TableScan(t) => {
+                let range = self.range.clone();
+
+                // filters may use columns that are NOT part of a projection, so we need the underlying schema. Because
+                // that's a bit of a mess in DF, we reconstruct the schema using the plan builder.
+                let unprojected_scan = LogicalPlanBuilder::scan_with_filters(
+                    t.table_name.to_owned(),
+                    Arc::clone(&t.source),
+                    None,
+                    t.filters.clone(),
+                )
+                .map_err(|e| e.context("reconstruct unprojected scheam"))?;
+                let unprojected_schema = unprojected_scan.schema();
+                let range = t
+                    .filters
+                    .iter()
+                    .flat_map(split_conjunction)
+                    .try_fold(range, |range, expr| {
+                        range.with_expr(unprojected_schema, &self.col, expr)
+                    })?;
+                self.range = range;
+                Ok(VisitRecursion::Continue)
+            }
+            LogicalPlan::SubqueryAlias(_) => {
+                // The nodes below this one refer to the column with a different table name,
+                // just unset the relation so we match on the column name.
+                self.col.relation = None;
+                Ok(VisitRecursion::Continue)
+            }
+            // These nodes do not alter their schema, so we can recurse through them
+            LogicalPlan::Sort(_)
+            | LogicalPlan::Repartition(_)
+            | LogicalPlan::Limit(_)
+            | LogicalPlan::Distinct(_) => Ok(VisitRecursion::Continue),
+            // At some point we may wish to handle joins here too.
+            _ => Ok(VisitRecursion::Stop),
+        }
+    }
+}
+
+/// Encapsulates the upper and lower bounds of a time column
+/// in a logical plan.
+#[derive(Clone)]
+struct TimeRange(pub Range<Bound<Expr>>);
+
+impl Default for TimeRange {
+    fn default() -> Self {
+        Self(Range {
+            start: Bound::Unbounded,
+            end: Bound::Unbounded,
+        })
+    }
+}
+
+impl TimeRange {
+    // If the given expression uses the given column with comparison operators, update
+    // this time range to reflect that.
+    fn with_expr(self, schema: &DFSchema, time_col: &Column, expr: &Expr) -> Result<Self> {
+        let is_time_col = |e| -> Result<bool> {
+            match Expr::try_into_col(e) {
+                Ok(col) => Ok(schema.index_of_column(&col)? == schema.index_of_column(time_col)?),
+                Err(_) => Ok(false),
+            }
+        };
+
+        Ok(match expr {
+            Expr::BinaryExpr(BinaryExpr { left, op, right }) if is_time_col(left)? => match op {
+                Operator::Lt => self.with_upper(Bound::Excluded(*right.clone())),
+                Operator::LtEq => self.with_upper(Bound::Included(*right.clone())),
+                Operator::Gt => self.with_lower(Bound::Excluded(*right.clone())),
+                Operator::GtEq => self.with_lower(Bound::Included(*right.clone())),
+                _ => self,
+            },
+            Expr::BinaryExpr(BinaryExpr { left, op, right }) if is_time_col(right)? => match op {
+                Operator::Lt => self.with_lower(Bound::Excluded(*left.clone())),
+                Operator::LtEq => self.with_lower(Bound::Included(*left.clone())),
+                Operator::Gt => self.with_upper(Bound::Excluded(*left.clone())),
+                Operator::GtEq => self.with_upper(Bound::Included(*left.clone())),
+                _ => self,
+            },
+            // Between bounds are inclusive
+            Expr::Between(Between {
+                expr,
+                negated: false,
+                low,
+                high,
+            }) if is_time_col(expr)?
 => self
+                .with_lower(Bound::Included(*low.clone()))
+                .with_upper(Bound::Included(*high.clone())),
+            _ => self,
+        })
+    }
+
+    fn with_lower(self, start: Bound<Expr>) -> Self {
+        Self(Range {
+            start,
+            end: self.0.end,
+        })
+    }
+
+    fn with_upper(self, end: Bound<Expr>) -> Self {
+        Self(Range {
+            start: self.0.start,
+            end,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        ops::{Bound, Range},
+        sync::Arc,
+    };
+
+    use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+    use datafusion::{
+        error::Result,
+        logical_expr::{
+            logical_plan::{self, builder::LogicalTableSource},
+            Between, LogicalPlan, LogicalPlanBuilder,
+        },
+        prelude::{col, lit, Column, Expr, Partitioning},
+        sql::TableReference,
+    };
+    use datafusion_util::lit_timestamptz_nano;
+
+    use super::find_time_range;
+
+    fn schema() -> Schema {
+        Schema::new(vec![
+            Field::new(
+                "time",
+                DataType::Timestamp(TimeUnit::Nanosecond, None),
+                false,
+            ),
+            Field::new("temp", DataType::Float64, false),
+        ])
+    }
+
+    fn table_scan() -> Result<LogicalPlan> {
+        let schema = schema();
+        logical_plan::table_scan(Some("t"), &schema, None)?.build()
+    }
+
+    fn simple_filter_plan(pred: Expr, inline_filter: bool) -> Result<LogicalPlan> {
+        let schema = schema();
+        let table_source = Arc::new(LogicalTableSource::new(Arc::new(schema)));
+        let name = TableReference::from("t").to_quoted_string();
+        if inline_filter {
+            LogicalPlanBuilder::scan_with_filters(name, table_source, None, vec![pred])?.build()
+        } else {
+            LogicalPlanBuilder::scan(name, table_source, None)?
+                .filter(pred)?
+ .build() + } + } + + fn between(expr: Expr, low: Expr, high: Expr) -> Expr { + Expr::Between(Between { + expr: Box::new(expr), + negated: false, + low: Box::new(low), + high: Box::new(high), + }) + } + + #[test] + fn test_find_range() -> Result<()> { + let time_col = Column::from_name("time"); + + let cases = vec![ + ( + "unbounded", + lit(true), + Range { + start: Bound::Unbounded, + end: Bound::Unbounded, + }, + ), + ( + "time_gt_val", + col("time").gt(lit_timestamptz_nano(1000)), + Range { + start: Bound::Excluded(lit_timestamptz_nano(1000)), + end: Bound::Unbounded, + }, + ), + ( + "time_gt_eq_val", + col("time").gt_eq(lit_timestamptz_nano(1000)), + Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Unbounded, + }, + ), + ( + "time_lt_val", + col("time").lt(lit_timestamptz_nano(1000)), + Range { + start: Bound::Unbounded, + end: Bound::Excluded(lit_timestamptz_nano(1000)), + }, + ), + ( + "time_lt_eq_val", + col("time").lt_eq(lit_timestamptz_nano(1000)), + Range { + start: Bound::Unbounded, + end: Bound::Included(lit_timestamptz_nano(1000)), + }, + ), + ( + "val_gt_time", + lit_timestamptz_nano(1000).gt(col("time")), + Range { + start: Bound::Unbounded, + end: Bound::Excluded(lit_timestamptz_nano(1000)), + }, + ), + ( + "val_gt_eq_time", + lit_timestamptz_nano(1000).gt_eq(col("time")), + Range { + start: Bound::Unbounded, + end: Bound::Included(lit_timestamptz_nano(1000)), + }, + ), + ( + "val_lt_time", + lit_timestamptz_nano(1000).lt(col("time")), + Range { + start: Bound::Excluded(lit_timestamptz_nano(1000)), + end: Bound::Unbounded, + }, + ), + ( + "val_lt_eq_time", + lit_timestamptz_nano(1000).lt_eq(col("time")), + Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Unbounded, + }, + ), + ( + "and", + col("time") + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), + Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: 
Bound::Excluded(lit_timestamptz_nano(2000)), + }, + ), + ( + "between", + between( + col("time"), + lit_timestamptz_nano(1000), + lit_timestamptz_nano(2000), + ), + Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Included(lit_timestamptz_nano(2000)), + }, + ), + ]; + for (name, pred, expected) in cases { + for inline_filter in [false, true] { + let plan = simple_filter_plan(pred.clone(), inline_filter)?; + let actual = find_time_range(&plan, &time_col)?; + assert_eq!( + expected, actual, + "test case `{name}` with inline_filter={inline_filter} failed", + ); + } + } + Ok(()) + } + + #[test] + fn plan_traversal() -> Result<()> { + // Show that the time range can be found + // - through nodes that don't alter their schema + // - even when predicates are in different filter nodes + // - through projections that alias columns + let plan = LogicalPlanBuilder::from(table_scan()?) + .filter(col("time").gt_eq(lit_timestamptz_nano(1000)))? + .sort(vec![col("time")])? + .limit(0, Some(10))? + .project(vec![col("time").alias("other_time")])? + .filter(col("other_time").lt(lit_timestamptz_nano(2000)))? + .distinct()? + .repartition(Partitioning::RoundRobinBatch(1))? + .project(vec![col("other_time").alias("my_time")])? 
+ .build()?; + let time_col = Column::from_name("my_time"); + let actual = find_time_range(&plan, &time_col)?; + let expected = Range { + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), + }; + assert_eq!(expected, actual); + Ok(()) + } +} diff --git a/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs b/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs new file mode 100644 index 0000000..3660cdb --- /dev/null +++ b/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs @@ -0,0 +1,96 @@ +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::{ + common::{tree_node::TreeNodeRewriter, DFSchema}, + error::DataFusionError, + logical_expr::{expr_rewriter::rewrite_preserving_name, LogicalPlan, Operator}, + optimizer::{OptimizerConfig, OptimizerRule}, + prelude::{binary_expr, lit, Expr}, + scalar::ScalarValue, +}; +use query_functions::{clean_non_meta_escapes, REGEX_MATCH_UDF_NAME, REGEX_NOT_MATCH_UDF_NAME}; + +/// Replaces InfluxDB-specific regex operator with DataFusion regex operator. +/// +/// InfluxDB has a special regex operator that is especially used by Flux/InfluxQL and that excepts certain escape +/// sequences that are normal Rust regex crate does NOT support. If the pattern is already known at planning time (i.e. +/// it is a constant), then we can clean the escape sequences and just use the ordinary DataFusion regex operator. This +/// is desired because the ordinary DataFusion regex operator can be optimized further (e.g. to cheaper `LIKE` expressions). +#[derive(Debug, Clone)] +pub struct InfluxRegexToDataFusionRegex {} + +impl InfluxRegexToDataFusionRegex { + /// Create new optimizer rule. 
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl OptimizerRule for InfluxRegexToDataFusionRegex {
+    fn name(&self) -> &str {
+        "influx_regex_to_datafusion_regex"
+    }
+
+    fn try_optimize(
+        &self,
+        plan: &LogicalPlan,
+        _config: &dyn OptimizerConfig,
+    ) -> datafusion::error::Result<Option<LogicalPlan>> {
+        optimize(plan).map(Some)
+    }
+}
+
+fn optimize(plan: &LogicalPlan) -> Result<LogicalPlan, DataFusionError> {
+    let new_inputs = plan
+        .inputs()
+        .iter()
+        .map(|input| optimize(input))
+        .collect::<Result<Vec<_>, DataFusionError>>()?;
+
+    let mut schema =
+        new_inputs
+            .iter()
+            .map(|input| input.schema())
+            .fold(DFSchema::empty(), |mut lhs, rhs| {
+                lhs.merge(rhs);
+                lhs
+            });
+
+    schema.merge(plan.schema());
+
+    let mut expr_rewriter = InfluxRegexToDataFusionRegex {};
+
+    let new_exprs = plan
+        .expressions()
+        .into_iter()
+        .map(|expr| rewrite_preserving_name(expr, &mut expr_rewriter))
+        .collect::<Result<Vec<_>, DataFusionError>>()?;
+    plan.with_new_exprs(new_exprs, &new_inputs)
+}
+
+impl TreeNodeRewriter for InfluxRegexToDataFusionRegex {
+    type N = Expr;
+
+    fn mutate(&mut self, expr: Expr) -> Result<Expr, DataFusionError> {
+        match expr {
+            Expr::ScalarFunction(ScalarFunction { func_def, mut args }) => {
+                let name = func_def.name();
+                if (args.len() == 2)
+                    && ((name == REGEX_MATCH_UDF_NAME) || (name == REGEX_NOT_MATCH_UDF_NAME))
+                {
+                    if let Expr::Literal(ScalarValue::Utf8(Some(s))) = &args[1] {
+                        let s = clean_non_meta_escapes(s);
+                        let op = match name {
+                            REGEX_MATCH_UDF_NAME => Operator::RegexMatch,
+                            REGEX_NOT_MATCH_UDF_NAME => Operator::RegexNotMatch,
+                            _ => unreachable!(),
+                        };
+                        return Ok(binary_expr(args.remove(0), op, lit(s)));
+                    }
+                }
+
+                Ok(Expr::ScalarFunction(ScalarFunction { func_def, args }))
+            }
+            _ => Ok(expr),
+        }
+    }
+}
diff --git a/iox_query/src/logical_optimizer/mod.rs b/iox_query/src/logical_optimizer/mod.rs
new file mode 100644
index 0000000..42b72e1
--- /dev/null
+++ b/iox_query/src/logical_optimizer/mod.rs
@@ -0,0 +1,23 @@
+use std::sync::Arc;
+
+use datafusion::execution::context::SessionState;
+
+use self::{
+
extract_sleep::ExtractSleep, handle_gapfill::HandleGapFill, + influx_regex_to_datafusion_regex::InfluxRegexToDataFusionRegex, +}; + +mod extract_sleep; +mod handle_gapfill; +mod influx_regex_to_datafusion_regex; +pub use handle_gapfill::range_predicate; + +/// Register IOx-specific logical [`OptimizerRule`]s with the SessionContext +/// +/// [`OptimizerRule`]: datafusion::optimizer::OptimizerRule +pub fn register_iox_logical_optimizers(state: SessionState) -> SessionState { + state + .add_optimizer_rule(Arc::new(InfluxRegexToDataFusionRegex::new())) + .add_optimizer_rule(Arc::new(ExtractSleep::new())) + .add_optimizer_rule(Arc::new(HandleGapFill::new())) +} diff --git a/iox_query/src/physical_optimizer/chunk_extraction.rs b/iox_query/src/physical_optimizer/chunk_extraction.rs new file mode 100644 index 0000000..488b5df --- /dev/null +++ b/iox_query/src/physical_optimizer/chunk_extraction.rs @@ -0,0 +1,367 @@ +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion::{ + datasource::physical_plan::ParquetExec, + error::DataFusionError, + physical_plan::{ + empty::EmptyExec, placeholder_row::PlaceholderRowExec, union::UnionExec, + visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor, + }, +}; +use observability_deps::tracing::debug; +use schema::sort::SortKey; + +use crate::{ + provider::{PartitionedFileExt, RecordBatchesExec}, + QueryChunk, +}; + +/// List of [`QueryChunk`]s. +pub type QueryChunks = Vec>; + +/// Extract chunks, schema, and output sort key from plans created with [`chunks_to_physical_nodes`]. +/// +/// Returns `None` if no chunks (or an [`EmptyExec`] in case that no chunks where passed to +/// [`chunks_to_physical_nodes`]) were found or if the chunk data is inconsistent. +/// +/// When no chunks were passed to [`chunks_to_physical_nodes`] and hence an [`EmptyExec`] was created, then no output +/// sort key can be reconstructed. However this is usually OK because it does not have any effect anyways. 
+/// +/// Note that this only works on the direct output of [`chunks_to_physical_nodes`]. If the plan is wrapped into +/// additional nodes (like de-duplication, filtering, projection) then NO data will be returned. Also [`ParquetExec`] +/// MUST NOT have a predicate attached. +/// +/// +/// [`chunks_to_physical_nodes`]: crate::provider::chunks_to_physical_nodes +pub fn extract_chunks( + plan: &dyn ExecutionPlan, +) -> Option<(SchemaRef, QueryChunks, Option)> { + let mut visitor = ExtractChunksVisitor::default(); + if let Err(e) = visit_execution_plan(plan, &mut visitor) { + debug!( + %e, + "cannot extract chunks", + ); + return None; + } + visitor + .schema + .map(|schema| (schema, visitor.chunks, visitor.sort_key)) +} + +#[derive(Debug, Default)] +struct ExtractChunksVisitor { + chunks: Vec>, + schema: Option, + sort_key: Option, +} + +impl ExtractChunksVisitor { + fn add_chunk(&mut self, chunk: Arc) { + self.chunks.push(chunk); + } + + fn add_schema_from_exec(&mut self, exec: &dyn ExecutionPlan) -> Result<(), DataFusionError> { + let schema = exec.schema(); + if let Some(existing) = &self.schema { + if existing != &schema { + return Err(DataFusionError::External( + String::from("Different schema").into(), + )); + } + } else { + self.schema = Some(schema); + } + Ok(()) + } + + fn add_sort_key(&mut self, sort_key: Option<&SortKey>) -> Result<(), DataFusionError> { + let Some(sort_key) = sort_key else { + return Ok(()); + }; + + if let Some(existing) = &self.sort_key { + if existing != sort_key { + return Err(DataFusionError::External( + String::from("Different sort key").into(), + )); + } + } else { + self.sort_key = Some(sort_key.clone()); + } + + Ok(()) + } +} + +impl ExecutionPlanVisitor for ExtractChunksVisitor { + type Error = DataFusionError; + + fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + let plan_any = plan.as_any(); + + if let Some(record_batches_exec) = plan_any.downcast_ref::() { + self.add_schema_from_exec(record_batches_exec) + 
.map_err(|e| { + DataFusionError::Context( + "add schema from RecordBatchesExec".to_owned(), + Box::new(e), + ) + })?; + + self.add_sort_key(record_batches_exec.output_sort_key_memo())?; + + for chunk in record_batches_exec.chunks() { + self.add_chunk(Arc::clone(chunk)); + } + } else if let Some(parquet_exec) = plan_any.downcast_ref::() { + if parquet_exec.predicate().is_some() { + return Err(DataFusionError::External( + String::from("ParquetExec has predicate").into(), + )); + } + + self.add_schema_from_exec(parquet_exec).map_err(|e| { + DataFusionError::Context("add schema from ParquetExec".to_owned(), Box::new(e)) + })?; + + for group in &parquet_exec.base_config().file_groups { + for file in group { + let ext = file + .extensions + .as_ref() + .and_then(|any| any.downcast_ref::()) + .ok_or_else(|| { + DataFusionError::External( + String::from("PartitionedFileExt not found").into(), + ) + })?; + self.add_sort_key(ext.output_sort_key_memo.as_ref())?; + self.add_chunk(Arc::clone(&ext.chunk)); + } + } + } else if plan_any.downcast_ref::().is_some() { + // should not produce dummy data + return Err(DataFusionError::External( + String::from("EmptyExec produces row").into(), + )); + } else if let Some(empty_exec) = plan_any.downcast_ref::() { + self.add_schema_from_exec(empty_exec).map_err(|e| { + DataFusionError::Context("add schema from EmptyExec".to_owned(), Box::new(e)) + })?; + } else if plan_any.downcast_ref::().is_some() { + // continue visiting + } else { + // unsupported node + return Err(DataFusionError::External( + String::from("Unsupported node").into(), + )); + } + + Ok(true) + } +} + +#[cfg(test)] +mod tests { + use crate::{provider::chunks_to_physical_nodes, test::TestChunk, util::df_physical_expr}; + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use data_types::ChunkId; + use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + physical_plan::{expressions::Literal, filter::FilterExec}, + prelude::{col, lit}, + 
scalar::ScalarValue, + }; + use schema::{merge::SchemaMerger, sort::SortKeyBuilder, SchemaBuilder, TIME_COLUMN_NAME}; + + use super::*; + + #[test] + fn test_roundtrip_empty() { + let schema = chunk(1).schema().as_arrow(); + assert_roundtrip(schema, vec![], None); + } + + #[test] + fn test_roundtrip_single_record_batch() { + let chunk1 = chunk(1); + let sort_key = Some(sort_key()); + assert_roundtrip(chunk1.schema().as_arrow(), vec![Arc::new(chunk1)], sort_key); + } + + #[test] + fn test_roundtrip_single_parquet() { + let chunk1 = chunk(1).with_dummy_parquet_file(); + let sort_key = Some(sort_key()); + assert_roundtrip(chunk1.schema().as_arrow(), vec![Arc::new(chunk1)], sort_key); + } + + #[test] + fn test_roundtrip_many_chunks() { + let chunk1 = chunk(1).with_dummy_parquet_file(); + let chunk2 = chunk(2).with_dummy_parquet_file(); + let chunk3 = chunk(3).with_dummy_parquet_file(); + let chunk4 = chunk(4); + let chunk5 = chunk(5); + let sort_key = Some(sort_key()); + assert_roundtrip( + chunk1.schema().as_arrow(), + vec![ + Arc::new(chunk1), + Arc::new(chunk2), + Arc::new(chunk3), + Arc::new(chunk4), + Arc::new(chunk5), + ], + sort_key, + ); + } + + #[test] + fn test_different_schemas() { + let some_chunk = chunk(1); + let iox_schema = some_chunk.schema(); + let schema1 = iox_schema.as_arrow(); + let schema2 = iox_schema.select_by_indices(&[]).as_arrow(); + let plan = UnionExec::new(vec![ + Arc::new(EmptyExec::new(schema1)), + Arc::new(EmptyExec::new(schema2)), + ]); + assert!(extract_chunks(&plan).is_none()); + } + + #[test] + fn test_empty_exec_with_rows() { + let schema = chunk(1).schema().as_arrow(); + let plan = PlaceholderRowExec::new(schema); + assert!(extract_chunks(&plan).is_none()); + } + + #[test] + fn test_empty_exec_no_iox_schema() { + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "x", + DataType::Float64, + true, + )])); + let plan = EmptyExec::new(Arc::clone(&schema)); + let (schema2, chunks, sort_key) = extract_chunks(&plan).unwrap(); + 
assert_eq!(schema, schema2); + assert!(chunks.is_empty()); + assert!(sort_key.is_none()); + } + + #[test] + fn test_different_sort_keys() { + let sort_key1 = Arc::new(SortKeyBuilder::new().with_col("tag1").build()); + let sort_key2 = Arc::new(SortKeyBuilder::new().with_col("tag2").build()); + let chunk1 = Arc::new(chunk(1)) as Arc; + let schema = chunk1.schema().as_arrow(); + let plan = UnionExec::new(vec![ + chunks_to_physical_nodes(&schema, Some(&sort_key1), vec![Arc::clone(&chunk1)], 1), + chunks_to_physical_nodes(&schema, Some(&sort_key2), vec![chunk1], 1), + ]); + assert!(extract_chunks(&plan).is_none()); + } + + #[test] + fn test_stop_at_other_node_types() { + let chunk1 = chunk(1); + let schema = chunk1.schema().as_arrow(); + let plan = chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1)], 2); + let plan = FilterExec::try_new( + df_physical_expr(plan.schema(), col("tag1").eq(lit("foo"))).unwrap(), + plan, + ) + .unwrap(); + assert!(extract_chunks(&plan).is_none()); + } + + #[test] + fn test_preserve_record_batches_exec_schema() { + let chunk = chunk(1); + let schema_ext = SchemaBuilder::new().tag("zzz").build().unwrap(); + let schema = SchemaMerger::new() + .merge(chunk.schema()) + .unwrap() + .merge(&schema_ext) + .unwrap() + .build() + .as_arrow(); + assert_roundtrip(schema, vec![Arc::new(chunk)], None); + } + + #[test] + fn test_preserve_parquet_exec_schema() { + let chunk = chunk(1).with_dummy_parquet_file(); + let schema_ext = SchemaBuilder::new().tag("zzz").build().unwrap(); + let schema = SchemaMerger::new() + .merge(chunk.schema()) + .unwrap() + .merge(&schema_ext) + .unwrap() + .build() + .as_arrow(); + assert_roundtrip(schema, vec![Arc::new(chunk)], None); + } + + #[test] + fn test_parquet_with_predicate_fails() { + let chunk = chunk(1).with_dummy_parquet_file(); + let schema = chunk.schema().as_arrow(); + let plan = chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk)], 2); + let plan = plan + .transform_down(&|plan| { + if let 
Some(exec) = plan.as_any().downcast_ref::() { + let exec = ParquetExec::new( + exec.base_config().clone(), + Some(Arc::new(Literal::new(ScalarValue::from(false)))), + None, + ); + return Ok(Transformed::Yes(Arc::new(exec))); + } + Ok(Transformed::No(plan)) + }) + .unwrap(); + assert!(extract_chunks(plan.as_ref()).is_none()); + } + + #[track_caller] + fn assert_roundtrip( + schema: SchemaRef, + chunks: Vec>, + output_sort_key: Option, + ) { + let plan = chunks_to_physical_nodes(&schema, output_sort_key.as_ref(), chunks.clone(), 2); + let (schema2, chunks2, output_sort_key2) = + extract_chunks(plan.as_ref()).expect("data found"); + assert_eq!(schema, schema2); + assert_eq!(chunk_ids(&chunks), chunk_ids(&chunks2)); + assert_eq!(output_sort_key, output_sort_key2); + } + + fn chunk_ids(chunks: &[Arc]) -> Vec { + let mut ids = chunks.iter().map(|c| c.id()).collect::>(); + ids.sort(); + ids + } + + fn chunk(id: u128) -> TestChunk { + TestChunk::new("table") + .with_id(id) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_i64_field_column("field") + .with_time_column() + } + + fn sort_key() -> SortKey { + SortKeyBuilder::new() + .with_col("tag2") + .with_col("tag1") + .with_col(TIME_COLUMN_NAME) + .build() + } +} diff --git a/iox_query/src/physical_optimizer/combine_chunks.rs b/iox_query/src/physical_optimizer/combine_chunks.rs new file mode 100644 index 0000000..d09681e --- /dev/null +++ b/iox_query/src/physical_optimizer/combine_chunks.rs @@ -0,0 +1,436 @@ +use std::sync::Arc; + +use arrow::compute::SortOptions; +use datafusion::{ + common::{ + plan_err, + tree_node::{Transformed, TreeNode}, + }, + config::ConfigOptions, + error::{DataFusionError, Result}, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{union::UnionExec, ExecutionPlan}, +}; +use observability_deps::tracing::trace; +use schema::TIME_COLUMN_NAME; + +use crate::{ + physical_optimizer::{ + chunk_extraction::extract_chunks, + sort::util::{collect_statistics_min_max, 
sort_by_value_ranges}, + }, + provider::chunks_to_physical_nodes, +}; + +/// Collects [`QueryChunk`]s and re-creates appropriate physical nodes. +/// +/// Invariants of inputs of the union: +/// 1. They do not overlap on time ranges (done in previous step: TimeSplit) +/// 2. Each input of the union is either with_chunks or other_plans. +/// - An input with_chunks is a plan that contains only (union of) ParquetExecs or RecordBatchesExec +/// - An input of other_plans is a plan that contains at least one node that is not a ParquetExec or +/// RecordBatchesExec or Union of them. Examples of those other nodes are FilterExec, DeduplicateExec, +/// ProjectionExec, etc. +// +/// Goals of this optimization step: +/// i. Combine **possible** plans with_chunks into a single union +/// ii. - Keep the combined plan non-overlapped on time ranges. This will likely help later optimization steps. +/// - If time ranges cannot be computed, combine all plans with_chunks into a single union. +/// +/// Example: w = with_chunks, o = other_plans +/// Input: |--P1 w --| |--P2 w --| |-- P3 o --| |-- P4 w --| |-- P5 w --| |-- P6 o --| |--P7 w --| +/// Output when time ranges can be computed: Only two sets of plans that are combined: [P1, P2], [P4, P5] +/// |------ P1 & P2 w ----| |-- P3 o --| |------ P4 & P5 w ------| |-- P6 o --| |--P7 w --| +/// Output when time ranges cannot be computed: all plans with_chunks are combined into a single union +/// |-------------------------- P1, P2, P4, P5, P7 w -------------------------------------| +/// |-- P3 o --| |-- P6 o --| +/// +/// +/// This is mostly useful after multiple re-arrangements (e.g. [`PartitionSplit`]-[`TimeSplit`]-[`RemoveDedup`]) created +/// a bunch of freestanding chunks that can be re-arranged into more packed, more efficient physical nodes.
+/// +/// +/// [`PartitionSplit`]: super::dedup::partition_split::PartitionSplit +/// [`QueryChunk`]: crate::QueryChunk +/// [`RemoveDedup`]: super::dedup::remove_dedup::RemoveDedup +/// [`TimeSplit`]: super::dedup::time_split::TimeSplit +#[derive(Debug, Default)] +pub struct CombineChunks; + +impl PhysicalOptimizerRule for CombineChunks { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + if let Some(union_exec) = plan.as_any().downcast_ref::() { + // sort and group the inputs by time range + let inputs = union_exec.inputs(); + // We only need to ensure the input are sorted by time range, + // any order is fine and hence we choose to go with ASC here + let groups = sort_and_group_plans( + inputs.clone(), + TIME_COLUMN_NAME, + SortOptions { + descending: false, + nulls_first: false, + }, + )?; + + // combine plans from each group + let plans = groups + .into_iter() + .map(|group| combine_plans(group, config)) + .collect::>>()? + .into_iter() + .flatten() + .collect::>(); + + let final_union = UnionExec::new(plans); + trace!(?final_union, "-------- final union"); + return Ok(Transformed::Yes(Arc::new(final_union))); + } + + Ok(Transformed::No(plan)) + }) + } + + fn name(&self) -> &str { + "combine_chunks" + } + + fn schema_check(&self) -> bool { + true + } +} + +/// Sort the given plans on the given column name and a given sort order. +/// +/// Then group them into non-overlapped groups based on the ranges of the given column, and return the groups. 
+/// +/// # Input Invariants +/// - Plans do not overlap on the given column +/// +/// # Output Invariants +/// - Plans in the same group do not overlap on the given column +/// -The groups do not overlap on the given column +/// +/// # Example +/// Input: +/// +/// ```text +/// 7 plans with value ranges : |--P1 w --| |--P2 w --| |-- P3 o --| |-- P4 w --| |-- P5 w --| |-- P6 o --| |--P7 w --| +/// ``` +/// +/// Output: +/// +/// ```text +/// 5 groups: [P1, P2], [P3], [P4, P5], [P6], [P7] +/// ``` +fn sort_and_group_plans( + plans: Vec>, + col_name: &str, + sort_options: SortOptions, +) -> Result>>> { + if plans.len() <= 1 { + return Ok(vec![plans]); + } + + let Some(value_ranges) = collect_statistics_min_max(&plans, col_name)? else { + // No statistics to sort and group the plans. + // Return all plans in the same group + trace!("-------- combine chunks - cannot collect statistics min max for column {col_name}"); + return Ok(vec![plans]); + }; + + // Sort the plans by their value ranges + trace!("-------- value_ranges: {:?}", value_ranges); + let Some(plans_value_ranges) = sort_by_value_ranges(plans.clone(), value_ranges, sort_options)? + else { + // The inputs are not being sorted by value ranges, cannot group them + // Return all plans in the same group + trace!("-------- inputs are not sorted by value ranges. No optimization"); + return Ok(vec![plans]); + }; + + // Group plans that can be combined + let plans = plans_value_ranges.plans; + let mut final_groups = Vec::with_capacity(plans.len()); + let mut combinable_plans = Vec::new(); + for plan in plans { + if extract_chunks(plan.as_ref()).is_some() { + combinable_plans.push(plan); + } else { + if !combinable_plans.is_empty() { + final_groups.push(combinable_plans); + combinable_plans = Vec::new(); + } + final_groups.push(vec![plan]); + } + } + + if !combinable_plans.is_empty() { + final_groups.push(combinable_plans); + } + + Ok(final_groups) +} + +/// Combine the given plans with chunks into a single union. 
The other plans stay as is. +fn combine_plans( + plans: Vec>, + config: &ConfigOptions, +) -> Result>> { + let (inputs_with_chunks, inputs_other): (Vec<_>, Vec<_>) = plans + .iter() + .cloned() + .partition(|plan| extract_chunks(plan.as_ref()).is_some()); + + if inputs_with_chunks.is_empty() { + return Ok(plans); + } + let union_of_chunks = UnionExec::new(inputs_with_chunks); + + if let Some((schema, chunks, output_sort_key)) = extract_chunks(&union_of_chunks) { + let union_of_chunks = chunks_to_physical_nodes( + &schema, + output_sort_key.as_ref(), + chunks, + config.execution.target_partitions, + ); + let Some(union_of_chunks) = union_of_chunks.as_any().downcast_ref::() else { + return plan_err!("Expected chunks_to_physical_nodes to produce UnionExec but got {union_of_chunks:?}"); + }; + + // return other_plans and the union_of_chunks + let plans = union_of_chunks + .inputs() + .iter() + .cloned() + .chain(inputs_other) + .collect(); + return Ok(plans); + } + + Ok(plans) +} + +#[cfg(test)] +mod tests { + use datafusion::{ + physical_plan::{expressions::Literal, filter::FilterExec, union::UnionExec}, + scalar::ScalarValue, + }; + + use crate::{physical_optimizer::test_util::OptimizationTest, test::TestChunk, QueryChunk}; + + use super::*; + + #[test] + fn test_combine_single_union_tree() { + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_time_column_with_stats(Some(1), Some(2)); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(3), Some(4)); + let chunk3 = TestChunk::new("table") + .with_id(3) + .with_time_column_with_stats(Some(5), Some(6)); + let chunk4 = TestChunk::new("table") + .with_id(4) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(7), Some(8)); + let chunk5 = TestChunk::new("table") + .with_id(5) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(9), Some(10)); + let schema = chunk1.schema().as_arrow(); + let plan = 
Arc::new(UnionExec::new(vec![ + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1), Arc::new(chunk2)], 2), + chunks_to_physical_nodes( + &schema, + None, + vec![Arc::new(chunk3), Arc::new(chunk4), Arc::new(chunk5)], + 2, + ), + ])); + let opt = CombineChunks; + let mut config = ConfigOptions::default(); + config.execution.target_partitions = 2; + insta::assert_yaml_snapshot!( + OptimizationTest::new_with_config(plan, opt, &config), + @r###" + --- + input: + - " UnionExec" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[time]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[time]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[time]" + - " ParquetExec: file_groups={2 groups: [[4.parquet], [5.parquet]]}, projection=[time]" + output: + Ok: + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[time]" + - " ParquetExec: file_groups={2 groups: [[2.parquet, 5.parquet], [4.parquet]]}, projection=[time]" + "### + ); + } + + #[test] + fn test_only_combine_contiguous_arms() { + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(1), Some(2)); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(3), Some(4)); + let chunk3 = TestChunk::new("table") + .with_id(3) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(5), Some(6)); + let chunk4 = TestChunk::new("table") + .with_id(4) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(7), Some(8)); + let schema = chunk1.schema().as_arrow(); + let plan = Arc::new(UnionExec::new(vec![ + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1)], 2), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk2)], 2), + Arc::new( + FilterExec::try_new( + Arc::new(Literal::new(ScalarValue::from(false))), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk3)], 2), + ) + .unwrap(), + ), + 
chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk4)], 2), + ])); + let opt = CombineChunks; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[time]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[time]" + - " FilterExec: false" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[time]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[time]" + output: + Ok: + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[time]" + - " FilterExec: false" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[time]" + - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[time]" + "### + ); + } + + #[test] + fn test_combine_some_union_arms() { + let chunk1 = TestChunk::new("table").with_id(1).with_dummy_parquet_file(); + let chunk2 = TestChunk::new("table").with_id(1).with_dummy_parquet_file(); + let chunk3 = TestChunk::new("table").with_id(1).with_dummy_parquet_file(); + let schema = chunk1.schema().as_arrow(); + let plan = Arc::new(UnionExec::new(vec![ + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1)], 2), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk2)], 2), + Arc::new( + FilterExec::try_new( + Arc::new(Literal::new(ScalarValue::from(false))), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk3)], 2), + ) + .unwrap(), + ), + ])); + let opt = CombineChunks; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}" + - " FilterExec: false" + - " UnionExec" + - " ParquetExec: file_groups={1 
group: [[1.parquet]]}" + output: + Ok: + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [1.parquet]]}" + - " FilterExec: false" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}" + "### + ); + } + + #[test] + fn test_no_chunks() { + let chunk1 = TestChunk::new("table").with_id(1); + let schema = chunk1.schema().as_arrow(); + let plan = chunks_to_physical_nodes(&schema, None, vec![], 2); + let opt = CombineChunks; + let mut config = ConfigOptions::default(); + config.execution.target_partitions = 2; + insta::assert_yaml_snapshot!( + OptimizationTest::new_with_config(plan, opt, &config), + @r###" + --- + input: + - " EmptyExec" + output: + Ok: + - " EmptyExec" + "### + ); + } + + #[test] + fn test_no_valid_arms() { + let chunk1 = TestChunk::new("table").with_id(1); + let schema = chunk1.schema().as_arrow(); + let plan = Arc::new(UnionExec::new(vec![Arc::new( + FilterExec::try_new( + Arc::new(Literal::new(ScalarValue::from(false))), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1)], 2), + ) + .unwrap(), + )])); + let opt = CombineChunks; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " FilterExec: false" + - " UnionExec" + - " RecordBatchesExec: chunks=1" + output: + Ok: + - " UnionExec" + - " FilterExec: false" + - " UnionExec" + - " RecordBatchesExec: chunks=1" + "### + ); + } +} diff --git a/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs b/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs new file mode 100644 index 0000000..341ae47 --- /dev/null +++ b/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs @@ -0,0 +1,249 @@ +use std::{collections::HashSet, sync::Arc}; + +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::ExecutionPlan, +}; +use schema::{sort::SortKeyBuilder, TIME_COLUMN_NAME}; + 
+use crate::{ + physical_optimizer::chunk_extraction::extract_chunks, + provider::{chunks_to_physical_nodes, DeduplicateExec}, + util::arrow_sort_key_exprs, +}; + +/// Determine sort key set of [`DeduplicateExec`] by eliminating all-NULL columns. +/// +/// This finds a good sort key for [`DeduplicateExec`] based on the [`QueryChunk`]s covered by the deduplication. +/// +/// We assume that columns that are NOT present in any chunks and hence are only created as pure NULL-columns are +/// not relevant for deduplication since they are effectively constant. +/// +/// +/// [`QueryChunk`]: crate::QueryChunk +#[derive(Debug, Default)] +pub struct DedupNullColumns; + +impl PhysicalOptimizerRule for DedupNullColumns { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + let plan_any = plan.as_any(); + + if let Some(dedup_exec) = plan_any.downcast_ref::() { + let mut children = dedup_exec.children(); + assert_eq!(children.len(), 1); + let child = children.remove(0); + let Some((schema, chunks, _output_sort_key)) = extract_chunks(child.as_ref()) + else { + return Ok(Transformed::No(plan)); + }; + + let pk_cols = dedup_exec.sort_columns(); + + let mut used_pk_cols = HashSet::new(); + for chunk in &chunks { + for (_type, field) in chunk.schema().iter() { + if pk_cols.contains(field.name().as_str()) { + used_pk_cols.insert(field.name().as_str()); + } + } + } + + let mut used_pk_cols = used_pk_cols.into_iter().collect::>(); + used_pk_cols.sort_by_key(|col| (*col == TIME_COLUMN_NAME, *col)); + + let mut sort_key_builder = SortKeyBuilder::new(); + for col in used_pk_cols { + sort_key_builder = sort_key_builder.with_col(col); + } + + let sort_key = sort_key_builder.build(); + let child = chunks_to_physical_nodes( + &schema, + (!sort_key.is_empty()).then_some(&sort_key), + chunks, + config.execution.target_partitions, + ); + + let sort_exprs = arrow_sort_key_exprs(&sort_key, &schema); + return
Ok(Transformed::Yes(Arc::new(DeduplicateExec::new( + child, + sort_exprs, + dedup_exec.use_chunk_order_col(), + )))); + } + + Ok(Transformed::No(plan)) + }) + } + + fn name(&self) -> &str { + "dedup_null_columns" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use schema::SchemaBuilder; + + use crate::{ + physical_optimizer::{ + dedup::test_util::{chunk, dedup_plan, dedup_plan_with_chunk_order_col}, + test_util::OptimizationTest, + }, + test::TestChunk, + QueryChunk, + }; + + use super::*; + + #[test] + fn test_no_chunks() { + let schema = chunk(1).schema().clone(); + let plan = dedup_plan(schema, vec![]); + let opt = DedupNullColumns; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " EmptyExec" + output: + Ok: + - " DeduplicateExec: []" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_single_chunk_all_cols() { + let chunk = chunk(1).with_dummy_parquet_file(); + let schema = chunk.schema().clone(); + let plan = dedup_plan(schema, vec![chunk]); + let opt = DedupNullColumns; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_single_chunk_schema_has_chunk_order_col() { + let chunk = chunk(1).with_dummy_parquet_file(); + let schema = chunk.schema().clone(); + let plan = dedup_plan_with_chunk_order_col(schema, vec![chunk]); + let opt = DedupNullColumns; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 
ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_single_chunk_misses_pk_cols() { + let chunk = TestChunk::new("table") + .with_id(1) + .with_tag_column("tag1") + .with_dummy_parquet_file(); + let schema = SchemaBuilder::new() + .tag("tag1") + .tag("tag2") + .tag("zzz") + .timestamp() + .build() + .unwrap(); + let plan = dedup_plan(schema, vec![chunk]); + let opt = DedupNullColumns; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC,zzz@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[tag1, tag2, zzz, time]" + output: + Ok: + - " DeduplicateExec: [tag1@0 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[tag1, tag2, zzz, time]" + "### + ); + } + + #[test] + fn test_two_chunks() { + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_time_column() + .with_dummy_parquet_file(); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_tag_column("tag1") + .with_tag_column("tag3") + .with_time_column() + .with_dummy_parquet_file(); + let schema = SchemaBuilder::new() + .tag("tag1") + .tag("tag2") + .tag("tag3") + .tag("tag4") + .timestamp() + .build() + .unwrap(); + let plan = dedup_plan(schema, vec![chunk1, chunk2]); + let opt = DedupNullColumns; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC,tag3@2 ASC,tag4@3 ASC,time@4 ASC]" + - 
" UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[tag1, tag2, tag3, tag4, time]" + output: + Ok: + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC,tag3@2 ASC,time@4 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[tag1, tag2, tag3, tag4, time]" + "### + ); + } +} diff --git a/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs b/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs new file mode 100644 index 0000000..c4b3924 --- /dev/null +++ b/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs @@ -0,0 +1,636 @@ +use std::{cmp::Reverse, sync::Arc}; + +use arrow::compute::SortOptions; +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::ExecutionPlan, +}; +use indexmap::IndexSet; +use schema::{sort::SortKeyBuilder, TIME_COLUMN_NAME}; + +use crate::{ + physical_optimizer::chunk_extraction::extract_chunks, + provider::{chunks_to_physical_nodes, DeduplicateExec}, + util::arrow_sort_key_exprs, + CHUNK_ORDER_COLUMN_NAME, +}; + +/// Determine sort key order of [`DeduplicateExec`]. +/// +/// This finds a cheap sort key order for [`DeduplicateExec`] based on the [`QueryChunk`]s covered by the deduplication. +/// This means that the sort key of the [`DeduplicateExec`] should be as close as possible to the pre-sorted chunks to +/// avoid resorting. If all chunks are pre-sorted (or not sorted at all), this is basically the joined merged sort key +/// of all of them. If the chunks do not agree on a single sort order[^different_orders], then we use a vote-based +/// system where we column-by-column pick the sort key order in the hope that this does the least harm. +/// +/// The produces sort key MUST be the same set of columns as before, i.e. this rule does NOT change the column set, it +/// only changes the order. 
+/// +/// We assume that the order of the sort key passed to [`DeduplicateExec`] is not relevant for correctness. +/// +/// This optimizer makes no assumption about how the ingester or compaction tier work or how chunks relate to each +/// other. As a consequence, it does NOT use the partition sort key. +/// +/// +/// [^different_orders]: In an ideal system, all chunks that have a sort order should agree on a single one. However we +/// want to avoid that the querier disintegrates when the ingester or compactor are buggy or when manual +/// interventions (like manual file creations) insert files that are slightly off. +/// +/// +/// [`QueryChunk`]: crate::QueryChunk +#[derive(Debug, Default)] +pub struct DedupSortOrder; + +impl PhysicalOptimizerRule for DedupSortOrder { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + let plan_any = plan.as_any(); + + if let Some(dedup_exec) = plan_any.downcast_ref::() { + let mut children = dedup_exec.children(); + assert_eq!(children.len(), 1); + let child = children.remove(0); + let Some((schema, chunks, _output_sort_key)) = extract_chunks(child.as_ref()) + else { + return Ok(Transformed::No(plan)); + }; + + let mut chunk_sort_keys: Vec> = chunks + .iter() + .map(|chunk| { + chunk + .sort_key() + .map(|sort_key| { + sort_key + .iter() + .map(|(col, opts)| { + assert_eq!(opts, &SortOptions::default()); + col.as_ref() + }) + .collect() + }) + .unwrap_or_default() + }) + .collect(); + + let mut quorum_sort_key_builder = SortKeyBuilder::default(); + let mut todo_pk_columns = dedup_exec.sort_columns(); + todo_pk_columns.remove(CHUNK_ORDER_COLUMN_NAME); + while !todo_pk_columns.is_empty() { + let candidate_counts = todo_pk_columns.iter().copied().map(|col| { + let count = chunk_sort_keys + .iter() + .filter(|sort_key| { + match sort_key.get_index_of(col) { + Some(0) => { + // Column next in sort order from this chunks PoV. This is good. 
+ true + } + Some(_) => { + // Column part of the sort order but we have at least one more column before + // that. Try to avoid an expensive resort for this chunk. + false + } + None => { + // Column is not in the sort order of this chunk at all. Hence we can place it + // everywhere in the quorum sort key w/o having to worry about this particular + // chunk. + true + } + } + }) + .count(); + (col, count) + }); + let candidate_counts = sorted( + candidate_counts + .into_iter() + .map(|(col, count)| (Reverse(count), col == TIME_COLUMN_NAME, col)), + ); + let next_key = candidate_counts.first().expect("all TODO cols inserted").2; + + for chunk_sort_key in &mut chunk_sort_keys { + chunk_sort_key.shift_remove_full(next_key); + } + + let was_present = todo_pk_columns.remove(next_key); + assert!(was_present); + + quorum_sort_key_builder = quorum_sort_key_builder.with_col(next_key); + } + + let quorum_sort_key = quorum_sort_key_builder.build(); + let child = chunks_to_physical_nodes( + &schema, + (!quorum_sort_key.is_empty()).then_some(&quorum_sort_key), + chunks, + config.execution.target_partitions, + ); + + let sort_exprs = arrow_sort_key_exprs(&quorum_sort_key, &schema); + return Ok(Transformed::Yes(Arc::new(DeduplicateExec::new( + child, + sort_exprs, + dedup_exec.use_chunk_order_col(), + )))); + } + + Ok(Transformed::No(plan)) + }) + } + + fn name(&self) -> &str { + "dedup_sort_order" + } + + fn schema_check(&self) -> bool { + true + } +} + +/// Collect items into a sorted vector. 
+fn sorted(it: impl IntoIterator) -> Vec +where + T: Ord, +{ + let mut items = it.into_iter().collect::>(); + items.sort(); + items +} + +#[cfg(test)] +mod tests { + use schema::{sort::SortKey, SchemaBuilder, TIME_COLUMN_NAME}; + + use crate::{ + physical_optimizer::{ + dedup::test_util::{chunk, dedup_plan, dedup_plan_with_chunk_order_col}, + test_util::OptimizationTest, + }, + test::TestChunk, + QueryChunk, + }; + + use super::*; + + #[test] + fn test_no_chunks() { + let schema = chunk(1).schema().clone(); + let plan = dedup_plan(schema, vec![]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " EmptyExec" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_single_chunk_no_sort_key() { + let chunk = chunk(1).with_dummy_parquet_file(); + let schema = chunk.schema().clone(); + let plan = dedup_plan(schema, vec![chunk]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_single_chunk_order() { + let chunk = chunk(1) + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let schema = chunk.schema().clone(); + let plan = dedup_plan(schema, vec![chunk]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 
ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time], output_ordering=[tag2@2 ASC, tag1@1 ASC, time@3 ASC]" + output: + Ok: + - " DeduplicateExec: [tag2@2 ASC,tag1@1 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time], output_ordering=[tag2@2 ASC, tag1@1 ASC, time@3 ASC]" + "### + ); + } + + #[test] + fn test_single_chunk_with_chunk_order_col() { + let chunk = chunk(1) + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let schema = chunk.schema().clone(); + let plan = dedup_plan_with_chunk_order_col(schema, vec![chunk]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[tag2@2 ASC, tag1@1 ASC, time@3 ASC, __chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [tag2@2 ASC,tag1@1 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[tag2@2 ASC, tag1@1 ASC, time@3 ASC, __chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_unusual_time_order() { + let chunk = chunk(1) + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from(TIME_COLUMN_NAME), + Arc::from("tag1"), + Arc::from("tag2"), + ])); + let schema = chunk.schema().clone(); + let plan = dedup_plan(schema, vec![chunk]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 
group: [[1.parquet]]}, projection=[field, tag1, tag2, time], output_ordering=[time@3 ASC, tag1@1 ASC, tag2@2 ASC]" + output: + Ok: + - " DeduplicateExec: [time@3 ASC,tag1@1 ASC,tag2@2 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time], output_ordering=[time@3 ASC, tag1@1 ASC, tag2@2 ASC]" + "### + ); + } + + #[test] + fn test_single_chunk_time_always_included() { + let chunk = chunk(1) + .with_tag_column("zzz") + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag1"), + ])); + let schema = chunk.schema().clone(); + let plan = dedup_plan(schema, vec![chunk]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,zzz@4 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time, zzz], output_ordering=[tag2@2 ASC, tag1@1 ASC]" + output: + Ok: + - " DeduplicateExec: [tag2@2 ASC,tag1@1 ASC,zzz@4 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[field, tag1, tag2, time, zzz]" + "### + ); + } + + #[test] + fn test_single_chunk_misses_pk_cols() { + let chunk = TestChunk::new("table") + .with_id(1) + .with_tag_column("tag1") + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([Arc::from("tag1")])); + let schema = SchemaBuilder::new() + .tag("tag1") + .tag("tag2") + .tag("zzz") + .timestamp() + .build() + .unwrap(); + let plan = dedup_plan(schema, vec![chunk]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC,zzz@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[tag1, tag2, zzz, time], output_ordering=[tag1@0 ASC]" + output: + Ok: + - " 
DeduplicateExec: [tag1@0 ASC,tag2@1 ASC,zzz@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[tag1, tag2, zzz, time], output_ordering=[tag1@0 ASC, tag2@1 ASC, zzz@2 ASC, time@3 ASC]" + "### + ); + } + + #[test] + fn test_two_chunks_break_even_by_col_name() { + let chunk1 = chunk(1) + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag1"), + Arc::from("tag2"), + Arc::from(TIME_COLUMN_NAME), + ])); + let chunk2 = chunk(2) + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_two_chunks_sorted_ranks_higher_than_not_sorted() { + let chunk1 = chunk(1) + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from(TIME_COLUMN_NAME), + ])); + let chunk2 = chunk(2) + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: 
file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[field, tag1, tag2, time], output_ordering=[tag2@2 ASC, tag1@1 ASC, time@3 ASC]" + output: + Ok: + - " DeduplicateExec: [tag2@2 ASC,tag1@1 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_two_chunks_one_without_sort_key() { + let chunk1 = chunk(1) + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let chunk2 = chunk(2).with_dummy_parquet_file(); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag2@2 ASC,tag1@1 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_three_chunks_different_subsets() { + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_time_column() + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_tag_column("tag1") + .with_tag_column("tag3") + .with_time_column() + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag3"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let chunk3 = TestChunk::new("table") + .with_id(3) + .with_tag_column("tag1") + .with_tag_column("tag2") + 
.with_tag_column("tag3") + .with_time_column() + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag3"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let schema = chunk3.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC,tag3@2 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet, 3.parquet], [2.parquet]]}, projection=[tag1, tag2, tag3, time]" + output: + Ok: + - " DeduplicateExec: [tag2@1 ASC,tag3@2 ASC,tag1@0 ASC,time@3 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={3 groups: [[1.parquet], [3.parquet], [2.parquet]]}, projection=[tag1, tag2, tag3, time], output_ordering=[tag2@1 ASC, tag3@2 ASC, tag1@0 ASC, time@3 ASC]" + "### + ); + } + + #[test] + fn test_three_chunks_single_chunk_has_extra_col1() { + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_tag_column("tag1") + .with_time_column() + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_tag_column("tag1") + .with_time_column() + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let chunk3 = TestChunk::new("table") + .with_id(3) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_time_column() + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let schema = chunk3.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " 
DeduplicateExec: [tag1@0 ASC,tag2@1 ASC,time@2 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet, 3.parquet], [2.parquet]]}, projection=[tag1, tag2, time], output_ordering=[tag2@1 ASC, tag1@0 ASC, time@2 ASC]" + output: + Ok: + - " DeduplicateExec: [tag2@1 ASC,tag1@0 ASC,time@2 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={3 groups: [[1.parquet], [3.parquet], [2.parquet]]}, projection=[tag1, tag2, time], output_ordering=[tag2@1 ASC, tag1@0 ASC, time@2 ASC]" + "### + ); + } + + #[test] + fn test_three_chunks_single_chunk_has_extra_col2() { + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_time_column() + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_time_column() + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let chunk3 = TestChunk::new("table") + .with_id(3) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_time_column() + .with_dummy_parquet_file() + .with_sort_key(SortKey::from_columns([ + Arc::from("tag2"), + Arc::from("tag1"), + Arc::from(TIME_COLUMN_NAME), + ])); + let schema = chunk3.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3]); + let opt = DedupSortOrder; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC,time@2 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet, 3.parquet], [2.parquet]]}, projection=[tag1, tag2, time], output_ordering=[tag2@1 ASC, tag1@0 ASC, time@2 ASC]" + output: + Ok: + - " DeduplicateExec: [tag2@1 ASC,tag1@0 ASC,time@2 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={3 groups: [[1.parquet], 
[3.parquet], [2.parquet]]}, projection=[tag1, tag2, time]" + "### + ); + } +} diff --git a/iox_query/src/physical_optimizer/dedup/mod.rs b/iox_query/src/physical_optimizer/dedup/mod.rs new file mode 100644 index 0000000..813cd3b --- /dev/null +++ b/iox_query/src/physical_optimizer/dedup/mod.rs @@ -0,0 +1,10 @@ +//! Optimizer passes concering de-duplication. + +pub mod dedup_null_columns; +pub mod dedup_sort_order; +pub mod partition_split; +pub mod remove_dedup; +pub mod time_split; + +#[cfg(test)] +mod test_util; diff --git a/iox_query/src/physical_optimizer/dedup/partition_split.rs b/iox_query/src/physical_optimizer/dedup/partition_split.rs new file mode 100644 index 0000000..386cd9c --- /dev/null +++ b/iox_query/src/physical_optimizer/dedup/partition_split.rs @@ -0,0 +1,287 @@ +use crate::{ + config::IoxConfigExt, + physical_optimizer::chunk_extraction::extract_chunks, + provider::{chunks_to_physical_nodes, DeduplicateExec}, + QueryChunk, +}; +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{union::UnionExec, ExecutionPlan}, +}; +use hashbrown::HashMap; +use observability_deps::tracing::warn; +use std::sync::Arc; + +/// Split de-duplication operations based on partitons. +/// +/// This should usually be more cost-efficient. 
+#[derive(Debug, Default)] +pub struct PartitionSplit; + +impl PhysicalOptimizerRule for PartitionSplit { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + let plan_any = plan.as_any(); + + if let Some(dedup_exec) = plan_any.downcast_ref::() { + let mut children = dedup_exec.children(); + assert_eq!(children.len(), 1); + let child = children.remove(0); + let Some((schema, chunks, output_sort_key)) = extract_chunks(child.as_ref()) else { + return Ok(Transformed::No(plan)); + }; + + let mut chunks_by_partition: HashMap<_, Vec>> = + Default::default(); + for chunk in chunks { + chunks_by_partition + .entry(chunk.partition_id().clone()) + .or_default() + .push(chunk); + } + + // If there not multiple partitions (0 or 1), then this optimizer is a no-op. Signal that to the + // optimizer framework. + if chunks_by_partition.len() < 2 { + return Ok(Transformed::No(plan)); + } + + // Protect against degenerative plans + let max_dedup_partition_split = config + .extensions + .get::() + .cloned() + .unwrap_or_default() + .max_dedup_partition_split; + if chunks_by_partition.len() > max_dedup_partition_split { + warn!( + n_partitions = chunks_by_partition.len(), + max_dedup_partition_split, + "cannot split dedup operation based on partition, too many partitions" + ); + return Ok(Transformed::No(plan)); + } + + // ensure deterministic order + let mut chunks_by_partition = chunks_by_partition.into_iter().collect::>(); + chunks_by_partition.sort_by(|a, b| a.0.cmp(&b.0)); + + let out = UnionExec::new( + chunks_by_partition + .into_iter() + .map(|(_p_id, chunks)| { + Arc::new(DeduplicateExec::new( + chunks_to_physical_nodes( + &schema, + output_sort_key.as_ref(), + chunks, + config.execution.target_partitions, + ), + dedup_exec.sort_keys().to_vec(), + dedup_exec.use_chunk_order_col(), + )) as _ + }) + .collect(), + ); + return Ok(Transformed::Yes(Arc::new(out))); + } + + Ok(Transformed::No(plan)) + }) + } + + fn 
name(&self) -> &str { + "partition_split" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::physical_optimizer::{ + dedup::test_util::{chunk, dedup_plan}, + test_util::OptimizationTest, + }; + use data_types::{PartitionHashId, PartitionId, TransitionPartitionId}; + + #[test] + fn test_no_chunks() { + let schema = chunk(1).schema().clone(); + let plan = dedup_plan(schema, vec![]); + let opt = PartitionSplit; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " EmptyExec" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_same_partition() { + let chunk1 = chunk(1); + let chunk2 = chunk(2); + let chunk3 = chunk(3).with_dummy_parquet_file(); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3]); + let opt = PartitionSplit; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_different_partitions() { + let chunk1 = chunk(1).with_partition(1); + let chunk2 = chunk(2).with_partition(2); + // use at least 3 parquet files for one of the two partitions to validate that `target_partitions` is forwared correctly + let chunk3 = chunk(3).with_dummy_parquet_file().with_partition(1); + let chunk4 = 
chunk(4).with_dummy_parquet_file().with_partition(2); + let chunk5 = chunk(5).with_dummy_parquet_file().with_partition(1); + let chunk6 = chunk(6).with_dummy_parquet_file().with_partition(1); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3, chunk4, chunk5, chunk6]); + let opt = PartitionSplit; + let mut config = ConfigOptions::default(); + config.execution.target_partitions = 2; + insta::assert_yaml_snapshot!( + OptimizationTest::new_with_config(plan, opt, &config), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time]" + output: + Ok: + - " UnionExec" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={2 groups: [[3.parquet, 6.parquet], [5.parquet]]}, projection=[field, tag1, tag2, time]" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_different_partitions_with_and_without_hash_ids() { + // Partition without hash ID in the catalog + let legacy_partition_id = 1; + let legacy_transition_partition_id = + TransitionPartitionId::Deprecated(PartitionId::new(legacy_partition_id)); + + // Partition with hash ID in the catalog + let transition_partition_id = + TransitionPartitionId::Deterministic(PartitionHashId::arbitrary_for_testing()); + + let chunk1 = chunk(1).with_partition_id(legacy_transition_partition_id.clone()); + let chunk2 = chunk(2).with_partition_id(transition_partition_id.clone()); + + let chunk3 = chunk(3) 
+ .with_dummy_parquet_file() + .with_partition_id(legacy_transition_partition_id.clone()); + let chunk4 = chunk(4) + .with_dummy_parquet_file() + .with_partition_id(transition_partition_id.clone()); + let chunk5 = chunk(5) + .with_dummy_parquet_file() + .with_partition_id(legacy_transition_partition_id.clone()); + let chunk6 = chunk(6) + .with_dummy_parquet_file() + .with_partition_id(legacy_transition_partition_id.clone()); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3, chunk4, chunk5, chunk6]); + let opt = PartitionSplit; + let mut config = ConfigOptions::default(); + config.execution.target_partitions = 2; + insta::assert_yaml_snapshot!( + OptimizationTest::new_with_config(plan, opt, &config), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time]" + output: + Ok: + - " UnionExec" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={2 groups: [[3.parquet, 6.parquet], [5.parquet]]}, projection=[field, tag1, tag2, time]" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_max_split() { + let chunk1 = chunk(1).with_partition(1); + let chunk2 = chunk(2).with_partition(2); + let chunk3 = chunk(3).with_partition(3); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3]); + let opt = PartitionSplit; + let mut config = ConfigOptions::default(); + 
config.extensions.insert(IoxConfigExt { + max_dedup_partition_split: 2, + ..Default::default() + }); + insta::assert_yaml_snapshot!( + OptimizationTest::new_with_config(plan, opt, &config), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" + "### + ); + } +} diff --git a/iox_query/src/physical_optimizer/dedup/remove_dedup.rs b/iox_query/src/physical_optimizer/dedup/remove_dedup.rs new file mode 100644 index 0000000..9558c5a --- /dev/null +++ b/iox_query/src/physical_optimizer/dedup/remove_dedup.rs @@ -0,0 +1,159 @@ +use std::sync::Arc; + +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::ExecutionPlan, +}; + +use crate::{ + physical_optimizer::chunk_extraction::extract_chunks, + provider::{chunks_to_physical_nodes, DeduplicateExec}, +}; + +/// Removes de-duplication operation if there are at most 1 chunks and this chunk does NOT contain primary-key duplicates. 
+#[derive(Debug, Default)] +pub struct RemoveDedup; + +impl PhysicalOptimizerRule for RemoveDedup { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + let plan_any = plan.as_any(); + + if let Some(dedup_exec) = plan_any.downcast_ref::() { + let mut children = dedup_exec.children(); + assert_eq!(children.len(), 1); + let child = children.remove(0); + let Some((schema, chunks, output_sort_key)) = extract_chunks(child.as_ref()) else { + return Ok(Transformed::No(plan)); + }; + + if (chunks.len() < 2) && chunks.iter().all(|c| !c.may_contain_pk_duplicates()) { + return Ok(Transformed::Yes(chunks_to_physical_nodes( + &schema, + output_sort_key.as_ref(), + chunks, + config.execution.target_partitions, + ))); + } + } + + Ok(Transformed::No(plan)) + }) + } + + fn name(&self) -> &str { + "remove_dedup" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use crate::{ + physical_optimizer::{ + dedup::test_util::{chunk, dedup_plan}, + test_util::OptimizationTest, + }, + QueryChunk, + }; + + use super::*; + + #[test] + fn test_no_chunks() { + let schema = chunk(1).schema().clone(); + let plan = dedup_plan(schema, vec![]); + let opt = RemoveDedup; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " EmptyExec" + output: + Ok: + - " EmptyExec" + "### + ); + } + + #[test] + fn test_single_chunk_no_pk_dups() { + let chunk1 = chunk(1).with_may_contain_pk_duplicates(false); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1]); + let opt = RemoveDedup; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" + output: + Ok: + - " UnionExec" + - " RecordBatchesExec: chunks=1, 
projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_single_chunk_with_pk_dups() { + let chunk1 = chunk(1).with_may_contain_pk_duplicates(true); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1]); + let opt = RemoveDedup; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_multiple_chunks() { + let chunk1 = chunk(1).with_may_contain_pk_duplicates(false); + let chunk2 = chunk(2).with_may_contain_pk_duplicates(false); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2]); + let opt = RemoveDedup; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" + "### + ); + } +} diff --git a/iox_query/src/physical_optimizer/dedup/test_util.rs b/iox_query/src/physical_optimizer/dedup/test_util.rs new file mode 100644 index 0000000..20d8a99 --- /dev/null +++ b/iox_query/src/physical_optimizer/dedup/test_util.rs @@ -0,0 +1,62 @@ +use std::sync::Arc; + +use arrow::datatypes::{Fields, Schema as ArrowSchema}; +use datafusion::physical_plan::ExecutionPlan; +use schema::Schema; + +use crate::{ + chunk_order_field, + provider::{chunks_to_physical_nodes, DeduplicateExec}, + test::TestChunk, + util::arrow_sort_key_exprs, + QueryChunk, +}; + +pub fn 
dedup_plan(schema: Schema, chunks: Vec) -> Arc { + dedup_plan_impl(schema, chunks, false) +} + +pub fn dedup_plan_with_chunk_order_col( + schema: Schema, + chunks: Vec, +) -> Arc { + dedup_plan_impl(schema, chunks, true) +} + +fn dedup_plan_impl( + schema: Schema, + chunks: Vec, + use_chunk_order_col: bool, +) -> Arc { + let chunks = chunks + .into_iter() + .map(|c| Arc::new(c) as _) + .collect::>>(); + let arrow_schema = if use_chunk_order_col { + Arc::new(ArrowSchema::new( + schema + .as_arrow() + .fields + .iter() + .cloned() + .chain(std::iter::once(chunk_order_field())) + .collect::(), + )) + } else { + schema.as_arrow() + }; + let plan = chunks_to_physical_nodes(&arrow_schema, None, chunks, 2); + + let sort_key = schema::sort::SortKey::from_columns(schema.primary_key()); + let sort_exprs = arrow_sort_key_exprs(&sort_key, &plan.schema()); + Arc::new(DeduplicateExec::new(plan, sort_exprs, use_chunk_order_col)) +} + +pub fn chunk(id: u128) -> TestChunk { + TestChunk::new("table") + .with_id(id) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_i64_field_column("field") + .with_time_column() +} diff --git a/iox_query/src/physical_optimizer/dedup/time_split.rs b/iox_query/src/physical_optimizer/dedup/time_split.rs new file mode 100644 index 0000000..29acccb --- /dev/null +++ b/iox_query/src/physical_optimizer/dedup/time_split.rs @@ -0,0 +1,235 @@ +use std::sync::Arc; + +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{union::UnionExec, ExecutionPlan}, +}; +use observability_deps::tracing::warn; + +use crate::{ + config::IoxConfigExt, + physical_optimizer::chunk_extraction::extract_chunks, + provider::{chunks_to_physical_nodes, group_potential_duplicates, DeduplicateExec}, +}; + +/// Split de-duplication operations based on time. +/// +/// Chunks that overlap will be part of the same de-dup group. 
+/// +/// This should usually be more cost-efficient. +#[derive(Debug, Default)] +pub struct TimeSplit; + +impl PhysicalOptimizerRule for TimeSplit { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + let plan_any = plan.as_any(); + + if let Some(dedup_exec) = plan_any.downcast_ref::() { + let mut children = dedup_exec.children(); + assert_eq!(children.len(), 1); + let child = children.remove(0); + let Some((schema, chunks, output_sort_key)) = extract_chunks(child.as_ref()) else { + return Ok(Transformed::No(plan)); + }; + + let groups = group_potential_duplicates(chunks); + + // if there are no chunks or there is only one group, we don't need to split + if groups.len() < 2 { + return Ok(Transformed::No(plan)); + } + + // Protect against degenerative plans + let max_dedup_time_split = config + .extensions + .get::() + .cloned() + .unwrap_or_default() + .max_dedup_time_split; + if groups.len() > max_dedup_time_split { + warn!( + n_groups = groups.len(), + max_dedup_time_split, + "cannot split dedup operation based on time overlaps, too many groups" + ); + return Ok(Transformed::No(plan)); + } + + let out = UnionExec::new( + groups + .into_iter() + .map(|chunks| { + Arc::new(DeduplicateExec::new( + chunks_to_physical_nodes( + &schema, + output_sort_key.as_ref(), + chunks, + config.execution.target_partitions, + ), + dedup_exec.sort_keys().to_vec(), + dedup_exec.use_chunk_order_col(), + )) as _ + }) + .collect(), + ); + return Ok(Transformed::Yes(Arc::new(out))); + } + + Ok(Transformed::No(plan)) + }) + } + + fn name(&self) -> &str { + "time_split" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use crate::{ + physical_optimizer::{ + dedup::test_util::{chunk, dedup_plan}, + test_util::OptimizationTest, + }, + QueryChunk, + }; + + use super::*; + + #[test] + fn test_no_chunks() { + let schema = chunk(1).schema().clone(); + let plan = dedup_plan(schema, vec![]); + let opt 
= TimeSplit; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " EmptyExec" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_all_overlap() { + let chunk1 = chunk(1).with_timestamp_min_max(5, 10); + let chunk2 = chunk(2).with_timestamp_min_max(3, 5); + let chunk3 = chunk(3) + .with_dummy_parquet_file() + .with_timestamp_min_max(8, 9); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3]); + let opt = TimeSplit; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_different_groups() { + let chunk1 = chunk(1).with_timestamp_min_max(0, 10); + let chunk2 = chunk(2).with_timestamp_min_max(11, 12); + // use at least 3 parquet files for one of the two partitions to validate that `target_partitions` is forwarded correctly + let chunk3 = chunk(3) + .with_dummy_parquet_file() + .with_timestamp_min_max(1, 5); + let chunk4 = chunk(4) + .with_dummy_parquet_file() + .with_timestamp_min_max(11, 11); + let chunk5 = chunk(5) + .with_dummy_parquet_file() + .with_timestamp_min_max(7, 8); + let chunk6 = chunk(6) + .with_dummy_parquet_file() + .with_timestamp_min_max(0, 0); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3, chunk4, 
chunk5, chunk6]); + let opt = TimeSplit; + let mut config = ConfigOptions::default(); + config.execution.target_partitions = 2; + insta::assert_yaml_snapshot!( + OptimizationTest::new_with_config(plan, opt, &config), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time]" + output: + Ok: + - " UnionExec" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={2 groups: [[6.parquet, 5.parquet], [3.parquet]]}, projection=[field, tag1, tag2, time]" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" + - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time]" + "### + ); + } + + #[test] + fn test_max_split() { + let chunk1 = chunk(1).with_timestamp_min_max(1, 1); + let chunk2 = chunk(2).with_timestamp_min_max(2, 2); + let chunk3 = chunk(3).with_timestamp_min_max(3, 3); + let schema = chunk1.schema().clone(); + let plan = dedup_plan(schema, vec![chunk1, chunk2, chunk3]); + let opt = TimeSplit; + let mut config = ConfigOptions::default(); + config.extensions.insert(IoxConfigExt { + max_dedup_time_split: 2, + ..Default::default() + }); + insta::assert_yaml_snapshot!( + OptimizationTest::new_with_config(plan, opt, &config), + @r###" + --- + input: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" + output: + Ok: + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" + "### + ); + } +} 
diff --git a/iox_query/src/physical_optimizer/mod.rs b/iox_query/src/physical_optimizer/mod.rs new file mode 100644 index 0000000..a0bf7a4 --- /dev/null +++ b/iox_query/src/physical_optimizer/mod.rs @@ -0,0 +1,56 @@ +use std::sync::Arc; + +use datafusion::{execution::context::SessionState, physical_optimizer::PhysicalOptimizerRule}; + +use self::{ + combine_chunks::CombineChunks, + dedup::{ + dedup_null_columns::DedupNullColumns, dedup_sort_order::DedupSortOrder, + partition_split::PartitionSplit, remove_dedup::RemoveDedup, time_split::TimeSplit, + }, + predicate_pushdown::PredicatePushdown, + projection_pushdown::ProjectionPushdown, + sort::{order_union_sorted_inputs::OrderUnionSortedInputs, parquet_sortness::ParquetSortness}, + union::{nested_union::NestedUnion, one_union::OneUnion}, +}; + +mod chunk_extraction; +mod combine_chunks; +mod dedup; +mod predicate_pushdown; +mod projection_pushdown; +mod sort; +mod union; + +#[cfg(test)] +mod test_util; + +#[cfg(test)] +mod tests; + +/// Register IOx-specific [`PhysicalOptimizerRule`]s with the SessionContext +pub fn register_iox_physical_optimizers(state: SessionState) -> SessionState { + // prepend IOx-specific rules to DataFusion builtins + // The optimizer rules have to be done in this order + let mut optimizers: Vec> = vec![ + Arc::new(PartitionSplit), + Arc::new(TimeSplit), + Arc::new(RemoveDedup), + Arc::new(CombineChunks), + Arc::new(DedupNullColumns), + Arc::new(DedupSortOrder), + Arc::new(PredicatePushdown), + Arc::new(ProjectionPushdown), + Arc::new(ParquetSortness) as _, + Arc::new(NestedUnion), + Arc::new(OneUnion), + ]; + + // Append DataFUsion physical rules to the IOx-specific rules + optimizers.append(&mut state.physical_optimizers().to_vec()); + + // Add a rule to optimize plan with limit + optimizers.push(Arc::new(OrderUnionSortedInputs)); + + state.with_physical_optimizer_rules(optimizers) +} diff --git a/iox_query/src/physical_optimizer/predicate_pushdown.rs 
b/iox_query/src/physical_optimizer/predicate_pushdown.rs new file mode 100644 index 0000000..ab8ccd4 --- /dev/null +++ b/iox_query/src/physical_optimizer/predicate_pushdown.rs @@ -0,0 +1,496 @@ +use std::{collections::HashSet, sync::Arc}; + +use datafusion::{ + common::tree_node::{RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter}, + config::ConfigOptions, + datasource::physical_plan::ParquetExec, + error::{DataFusionError, Result}, + logical_expr::Operator, + physical_expr::{split_conjunction, utils::collect_columns}, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{ + empty::EmptyExec, + expressions::{BinaryExpr, Column}, + filter::FilterExec, + union::UnionExec, + ExecutionPlan, PhysicalExpr, + }, +}; + +use crate::provider::DeduplicateExec; + +/// Push down predicates. +#[derive(Debug, Default)] +pub struct PredicatePushdown; + +impl PhysicalOptimizerRule for PredicatePushdown { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + plan.transform_down(&|plan| { + let plan_any = plan.as_any(); + + if let Some(filter_exec) = plan_any.downcast_ref::() { + let mut children = filter_exec.children(); + assert_eq!(children.len(), 1); + let child = children.remove(0); + + let child_any = child.as_any(); + if child_any.downcast_ref::().is_some() { + return Ok(Transformed::Yes(child)); + } else if let Some(child_union) = child_any.downcast_ref::() { + let new_inputs = child_union + .inputs() + .iter() + .map(|input| { + FilterExec::try_new( + Arc::clone(filter_exec.predicate()), + Arc::clone(input), + ) + .map(|p| Arc::new(p) as Arc) + }) + .collect::>>()?; + let new_union = UnionExec::new(new_inputs); + return Ok(Transformed::Yes(Arc::new(new_union))); + } else if let Some(child_parquet) = child_any.downcast_ref::() { + let existing = child_parquet + .predicate() + .map(split_conjunction) + .unwrap_or_default(); + let both = conjunction( + existing + .into_iter() + .chain(split_conjunction(filter_exec.predicate())) + 
.cloned(), + ); + + let new_node = Arc::new(FilterExec::try_new( + Arc::clone(filter_exec.predicate()), + Arc::new(ParquetExec::new( + child_parquet.base_config().clone(), + both, + None, + )), + )?); + return Ok(Transformed::Yes(new_node)); + } else if let Some(child_dedup) = child_any.downcast_ref::() { + let dedup_cols = child_dedup.sort_columns(); + let (pushdown, no_pushdown): (Vec<_>, Vec<_>) = + split_conjunction(filter_exec.predicate()) + .into_iter() + .cloned() + .partition(|expr| { + collect_columns(expr) + .into_iter() + .all(|c| dedup_cols.contains(c.name())) + }); + + if !pushdown.is_empty() { + let mut grandchildren = child_dedup.children(); + assert_eq!(grandchildren.len(), 1); + let grandchild = grandchildren.remove(0); + + let mut new_node: Arc = Arc::new(DeduplicateExec::new( + Arc::new(FilterExec::try_new( + conjunction(pushdown).expect("not empty"), + grandchild, + )?), + child_dedup.sort_keys().to_vec(), + child_dedup.use_chunk_order_col(), + )); + if !no_pushdown.is_empty() { + new_node = Arc::new(FilterExec::try_new( + conjunction(no_pushdown).expect("not empty"), + new_node, + )?); + } + return Ok(Transformed::Yes(new_node)); + } + } + } + + Ok(Transformed::No(plan)) + }) + } + + fn name(&self) -> &str { + "predicate_pushdown" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[derive(Debug, Default)] +struct ColumnCollector { + cols: HashSet, +} + +impl TreeNodeRewriter for ColumnCollector { + type N = Arc; + + fn pre_visit( + &mut self, + node: &Arc, + ) -> Result { + if let Some(column) = node.as_any().downcast_ref::() { + self.cols.insert(column.clone()); + } + Ok(RewriteRecursion::Continue) + } + + fn mutate( + &mut self, + expr: Arc, + ) -> Result, DataFusionError> { + Ok(expr) + } +} + +fn conjunction( + parts: impl IntoIterator>, +) -> Option> { + parts + .into_iter() + .reduce(|lhs, rhs| Arc::new(BinaryExpr::new(lhs, Operator::And, rhs))) +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::{DataType, Field, Schema, 
SchemaRef}; + use datafusion::{ + datasource::object_store::ObjectStoreUrl, + datasource::physical_plan::FileScanConfig, + logical_expr::Operator, + physical_expr::PhysicalSortExpr, + physical_plan::{ + expressions::{BinaryExpr, Column, Literal}, + placeholder_row::PlaceholderRowExec, + PhysicalExpr, Statistics, + }, + scalar::ScalarValue, + }; + use schema::sort::SortKeyBuilder; + + use crate::{physical_optimizer::test_util::OptimizationTest, util::arrow_sort_key_exprs}; + + use super::*; + + #[test] + fn test_empty_no_rows() { + let schema = schema(); + let plan = Arc::new( + FilterExec::try_new(predicate_tag(&schema), Arc::new(EmptyExec::new(schema))).unwrap(), + ); + let opt = PredicatePushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " FilterExec: tag1@0 = foo" + - " EmptyExec" + output: + Ok: + - " EmptyExec" + "### + ); + } + + #[test] + fn test_empty_with_rows() { + let schema = schema(); + let plan = Arc::new( + FilterExec::try_new( + predicate_tag(&schema), + Arc::new(PlaceholderRowExec::new(schema)), + ) + .unwrap(), + ); + let opt = PredicatePushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " FilterExec: tag1@0 = foo" + - " PlaceholderRowExec" + output: + Ok: + - " FilterExec: tag1@0 = foo" + - " PlaceholderRowExec" + "### + ); + } + + #[test] + fn test_union() { + let schema = schema(); + let plan = Arc::new( + FilterExec::try_new( + predicate_tag(&schema), + Arc::new(UnionExec::new( + (0..2) + .map(|_| Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))) as _) + .collect(), + )), + ) + .unwrap(), + ); + let opt = PredicatePushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " FilterExec: tag1@0 = foo" + - " UnionExec" + - " PlaceholderRowExec" + - " PlaceholderRowExec" + output: + Ok: + - " UnionExec" + - " FilterExec: tag1@0 = foo" + - " PlaceholderRowExec" + - " FilterExec: tag1@0 = 
foo" + - " PlaceholderRowExec" + "### + ); + } + + #[test] + fn test_union_nested() { + let schema = schema(); + let plan = Arc::new( + FilterExec::try_new( + predicate_tag(&schema), + Arc::new(UnionExec::new(vec![Arc::new(UnionExec::new( + (0..2) + .map(|_| Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))) as _) + .collect(), + ))])), + ) + .unwrap(), + ); + let opt = PredicatePushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " FilterExec: tag1@0 = foo" + - " UnionExec" + - " UnionExec" + - " PlaceholderRowExec" + - " PlaceholderRowExec" + output: + Ok: + - " UnionExec" + - " UnionExec" + - " FilterExec: tag1@0 = foo" + - " PlaceholderRowExec" + - " FilterExec: tag1@0 = foo" + - " PlaceholderRowExec" + "### + ); + } + + #[test] + fn test_parquet() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![], + }; + let plan = Arc::new( + FilterExec::try_new( + predicate_mixed(&schema), + Arc::new(ParquetExec::new( + base_config, + Some(predicate_tag(&schema)), + None, + )), + ) + .unwrap(), + ); + let opt = PredicatePushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " FilterExec: tag1@0 = field@2" + - " ParquetExec: file_groups={0 groups: []}, projection=[tag1, tag2, field], predicate=tag1@0 = foo, pruning_predicate=tag1_min@0 <= foo AND foo <= tag1_max@1" + output: + Ok: + - " FilterExec: tag1@0 = field@2" + - " ParquetExec: file_groups={0 groups: []}, projection=[tag1, tag2, field], predicate=tag1@0 = foo AND tag1@0 = field@2, pruning_predicate=tag1_min@0 <= foo AND foo <= tag1_max@1" + "### + ); + } + + #[test] + fn test_dedup_no_pushdown() { + let schema = schema(); + let plan = 
Arc::new( + FilterExec::try_new( + predicate_field(&schema), + Arc::new(DeduplicateExec::new( + Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))), + sort_expr(&schema), + false, + )), + ) + .unwrap(), + ); + let opt = PredicatePushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " FilterExec: field@2 = val" + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" + - " PlaceholderRowExec" + output: + Ok: + - " FilterExec: field@2 = val" + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" + - " PlaceholderRowExec" + "### + ); + } + + #[test] + fn test_dedup_all_pushdown() { + let schema = schema(); + let plan = Arc::new( + FilterExec::try_new( + predicate_tag(&schema), + Arc::new(DeduplicateExec::new( + Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))), + sort_expr(&schema), + false, + )), + ) + .unwrap(), + ); + let opt = PredicatePushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " FilterExec: tag1@0 = foo" + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" + - " PlaceholderRowExec" + output: + Ok: + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" + - " FilterExec: tag1@0 = foo" + - " PlaceholderRowExec" + "### + ); + } + + #[test] + fn test_dedup_mixed() { + let schema = schema(); + let plan = Arc::new( + FilterExec::try_new( + conjunction([ + predicate_tag(&schema), + predicate_tags(&schema), + predicate_field(&schema), + predicate_mixed(&schema), + predicate_other(), + ]) + .expect("not empty"), + Arc::new(DeduplicateExec::new( + Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))), + sort_expr(&schema), + false, + )), + ) + .unwrap(), + ); + let opt = PredicatePushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " FilterExec: tag1@0 = foo AND tag1@0 = tag2@1 AND field@2 = val AND tag1@0 = field@2 AND true" + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" + - " PlaceholderRowExec" + output: + Ok: + - " 
FilterExec: field@2 = val AND tag1@0 = field@2" + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" + - " FilterExec: tag1@0 = foo AND tag1@0 = tag2@1 AND true" + - " PlaceholderRowExec" + "### + ); + } + + fn schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("tag1", DataType::Utf8, true), + Field::new("tag2", DataType::Utf8, true), + Field::new("field", DataType::UInt8, true), + ])) + } + + fn sort_expr(schema: &SchemaRef) -> Vec { + let sort_key = SortKeyBuilder::new() + .with_col("tag1") + .with_col("tag2") + .build(); + arrow_sort_key_exprs(&sort_key, schema) + } + + fn predicate_tag(schema: &SchemaRef) -> Arc { + Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("tag1", schema).unwrap()), + Operator::Eq, + Arc::new(Literal::new(ScalarValue::from("foo"))), + )) + } + + fn predicate_tags(schema: &SchemaRef) -> Arc { + Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("tag1", schema).unwrap()), + Operator::Eq, + Arc::new(Column::new_with_schema("tag2", schema).unwrap()), + )) + } + + fn predicate_field(schema: &SchemaRef) -> Arc { + Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("field", schema).unwrap()), + Operator::Eq, + Arc::new(Literal::new(ScalarValue::from("val"))), + )) + } + + fn predicate_mixed(schema: &SchemaRef) -> Arc { + Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("tag1", schema).unwrap()), + Operator::Eq, + Arc::new(Column::new_with_schema("field", schema).unwrap()), + )) + } + + fn predicate_other() -> Arc { + Arc::new(Literal::new(ScalarValue::from(true))) + } +} diff --git a/iox_query/src/physical_optimizer/projection_pushdown.rs b/iox_query/src/physical_optimizer/projection_pushdown.rs new file mode 100644 index 0000000..0efe597 --- /dev/null +++ b/iox_query/src/physical_optimizer/projection_pushdown.rs @@ -0,0 +1,1718 @@ +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; + +use arrow::datatypes::SchemaRef; +use datafusion::{ + common::tree_node::{Transformed, 
TreeNode}, + config::ConfigOptions, + datasource::physical_plan::{FileScanConfig, ParquetExec}, + error::{DataFusionError, Result}, + physical_expr::{ + utils::{collect_columns, reassign_predicate_columns}, + PhysicalSortExpr, + }, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{ + empty::EmptyExec, + expressions::Column, + filter::FilterExec, + placeholder_row::PlaceholderRowExec, + projection::ProjectionExec, + sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, + union::UnionExec, + ExecutionPlan, PhysicalExpr, + }, +}; + +use crate::provider::{DeduplicateExec, RecordBatchesExec}; + +/// Push down projections. +#[derive(Debug, Default)] +pub struct ProjectionPushdown; + +impl PhysicalOptimizerRule for ProjectionPushdown { + #[allow(clippy::only_used_in_recursion)] + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + plan.transform_down(&|plan| { + let plan_any = plan.as_any(); + + if let Some(projection_exec) = plan_any.downcast_ref::() { + let child = projection_exec.input(); + + let mut column_indices = Vec::with_capacity(projection_exec.expr().len()); + let mut column_names = Vec::with_capacity(projection_exec.expr().len()); + for (expr, output_name) in projection_exec.expr() { + if let Some(column) = expr.as_any().downcast_ref::() { + if column.name() == output_name { + column_indices.push(column.index()); + column_names.push(output_name.as_str()); + } else { + // don't bother w/ renames + return Ok(Transformed::No(plan)); + } + } else { + // don't bother to deal w/ calculation within projection nodes + return Ok(Transformed::No(plan)); + } + } + + let child_any = child.as_any(); + if let Some(child_empty) = child_any.downcast_ref::() { + let new_child = + EmptyExec::new(Arc::new(child_empty.schema().project(&column_indices)?)); + return Ok(Transformed::Yes(Arc::new(new_child))); + } else if let Some(child_placeholder) = + child_any.downcast_ref::() + { + let new_child = 
PlaceholderRowExec::new(Arc::new( + child_placeholder.schema().project(&column_indices)?, + )); + return Ok(Transformed::Yes(Arc::new(new_child))); + } else if let Some(child_union) = child_any.downcast_ref::() { + let new_inputs = child_union + .inputs() + .iter() + .map(|input| { + let exec = ProjectionExec::try_new( + projection_exec.expr().to_vec(), + Arc::clone(input), + )?; + Ok(Arc::new(exec) as _) + }) + .collect::>>()?; + let new_union = UnionExec::new(new_inputs); + return Ok(Transformed::Yes(Arc::new(new_union))); + } else if let Some(child_parquet) = child_any.downcast_ref::() { + let projection = match child_parquet.base_config().projection.as_ref() { + Some(projection) => column_indices + .into_iter() + .map(|idx| { + projection.get(idx).copied().ok_or_else(|| { + DataFusionError::Execution("Projection broken".to_string()) + }) + }) + .collect::>>()?, + None => column_indices, + }; + let output_ordering = child_parquet + .base_config() + .output_ordering + .iter() + .map(|output_ordering| { + project_output_ordering(output_ordering, projection_exec.schema()) + }) + .collect::>()?; + let base_config = FileScanConfig { + projection: Some(projection), + output_ordering, + ..child_parquet.base_config().clone() + }; + let new_child = + ParquetExec::new(base_config, child_parquet.predicate().cloned(), None); + return Ok(Transformed::Yes(Arc::new(new_child))); + } else if let Some(child_filter) = child_any.downcast_ref::() { + let filter_required_cols = collect_columns(child_filter.predicate()); + let filter_required_cols = filter_required_cols + .iter() + .map(|col| col.name()) + .collect::>(); + + let plan = wrap_user_into_projections( + &filter_required_cols, + &column_names, + Arc::clone(child_filter.input()), + |plan| { + Ok(Arc::new(FilterExec::try_new( + reassign_predicate_columns( + Arc::clone(child_filter.predicate()), + &plan.schema(), + false, + )?, + plan, + )?)) + }, + )?; + + return Ok(Transformed::Yes(plan)); + } else if let Some(child_sort) = 
child_any.downcast_ref::() { + let sort_required_cols = child_sort + .expr() + .iter() + .map(|expr| collect_columns(&expr.expr)) + .collect::>(); + let sort_required_cols = sort_required_cols + .iter() + .flat_map(|cols| cols.iter()) + .map(|col| col.name()) + .collect::>(); + + let plan = wrap_user_into_projections( + &sort_required_cols, + &column_names, + Arc::clone(child_sort.input()), + |plan| { + Ok(Arc::new( + SortExec::new( + reassign_sort_exprs_columns(child_sort.expr(), &plan.schema())?, + plan, + ) + .with_preserve_partitioning(child_sort.preserve_partitioning()) + .with_fetch(child_sort.fetch()), + )) + }, + )?; + + return Ok(Transformed::Yes(plan)); + } else if let Some(child_sort) = child_any.downcast_ref::() + { + let sort_required_cols = child_sort + .expr() + .iter() + .map(|expr| collect_columns(&expr.expr)) + .collect::>(); + let sort_required_cols = sort_required_cols + .iter() + .flat_map(|cols| cols.iter()) + .map(|col| col.name()) + .collect::>(); + + let plan = wrap_user_into_projections( + &sort_required_cols, + &column_names, + Arc::clone(child_sort.input()), + |plan| { + Ok(Arc::new(SortPreservingMergeExec::new( + reassign_sort_exprs_columns(child_sort.expr(), &plan.schema())?, + plan, + ))) + }, + )?; + + return Ok(Transformed::Yes(plan)); + } else if let Some(child_proj) = child_any.downcast_ref::() { + let expr = column_indices + .iter() + .map(|idx| child_proj.expr()[*idx].clone()) + .collect(); + let plan = Arc::new(ProjectionExec::try_new( + expr, + Arc::clone(child_proj.input()), + )?); + + // need to call `optimize` directly on the plan, because otherwise we would continue with the child + // and miss the optimization of that particular new ProjectionExec + let plan = self.optimize(plan, config)?; + + return Ok(Transformed::Yes(plan)); + } else if let Some(child_dedup) = child_any.downcast_ref::() { + let dedup_required_cols = child_dedup.sort_columns(); + + let mut children = child_dedup.children(); + assert_eq!(children.len(), 
1); + let input = children.pop().expect("just checked len"); + + let plan = wrap_user_into_projections( + &dedup_required_cols, + &column_names, + input, + |plan| { + let sort_keys = reassign_sort_exprs_columns( + child_dedup.sort_keys(), + &plan.schema(), + )?; + Ok(Arc::new(DeduplicateExec::new( + plan, + sort_keys, + child_dedup.use_chunk_order_col(), + ))) + }, + )?; + + return Ok(Transformed::Yes(plan)); + } else if let Some(child_recordbatches) = + child_any.downcast_ref::() + { + let new_child = RecordBatchesExec::new( + child_recordbatches.chunks().cloned(), + Arc::new(child_recordbatches.schema().project(&column_indices)?), + child_recordbatches.output_sort_key_memo().cloned(), + ); + return Ok(Transformed::Yes(Arc::new(new_child))); + } + } + + Ok(Transformed::No(plan)) + }) + } + + fn name(&self) -> &str { + "projection_pushdown" + } + + fn schema_check(&self) -> bool { + true + } +} + +/// Given the output ordering and a projected schema, returns the +/// largest prefix of the ordering that is in the projection +/// +/// For example, +/// +/// ```text +/// output_ordering: a, b, c +/// projection: a, c +/// returns --> a +/// ``` +/// +/// To see why the input has to be a prefix, consider this input: +/// +/// ```text +/// a b +/// 1 1 +/// 2 2 +/// 3 1 +/// `` +/// +/// It is sorted on `a,b` but *not* sorted on `b` +fn project_output_ordering( + output_ordering: &[PhysicalSortExpr], + projected_schema: SchemaRef, +) -> Result> { + // filter out sort exprs columns that got projected away + let known_columns = projected_schema + .all_fields() + .iter() + .map(|f| f.name().as_str()) + .collect::>(); + + // take longest prefix + let sort_exprs = output_ordering + .iter() + .take_while(|expr| { + if let Some(col) = expr.expr.as_any().downcast_ref::() { + known_columns.contains(col.name()) + } else { + // do not keep exprs like `a+1` or `-a` as they may + // not maintain ordering + false + } + }) + .cloned() + .collect::>(); + + 
reassign_sort_exprs_columns(&sort_exprs, &projected_schema) +} + +fn schema_name_projection( + schema: &SchemaRef, + cols: &[&str], +) -> Result, String)>> { + let idx_lookup = schema + .fields() + .iter() + .enumerate() + .map(|(idx, field)| (field.name().as_str(), idx)) + .collect::>(); + + cols.iter() + .map(|col| { + let idx = *idx_lookup.get(col).ok_or_else(|| { + DataFusionError::Execution(format!("Cannot find column to project: {col}")) + })?; + + let expr = Arc::new(Column::new(col, idx)) as _; + Ok((expr, (*col).to_owned())) + }) + .collect::>>() +} + +/// Wraps an intermediate node (like [`FilterExec`]) that has a single input but also uses some columns itself into +/// appropriate projections. +/// +/// This will turn: +/// +/// ```yaml +/// --- +/// projection: +/// user: # e.g. FilterExec +/// inner: +/// ``` +/// +/// into +/// +/// ```yaml +/// --- +/// projection: # if `user` outputs too many cols +/// user: +/// projection: # if `inner` outputs too many cols +/// inner: +/// ``` +fn wrap_user_into_projections( + user_required_cols: &HashSet<&str>, + outer_cols: &[&str], + inner_plan: Arc, + user_constructor: F, +) -> Result> +where + F: FnOnce(Arc) -> Result>, +{ + let mut plan = inner_plan; + + let inner_required_cols = user_required_cols + .iter() + .chain(outer_cols.iter()) + .copied() + .collect::>(); + + // sort inner required cols according the final projection + let outer_cols_order = outer_cols + .iter() + .copied() + .enumerate() + .map(|(idx, col)| (col, idx)) + .collect::>(); + let mut inner_projection_cols = inner_required_cols + .iter() + .copied() + .map(|col| { + // Note: if the col is NOT known, this will fail in `schema_name_projection`, so we just default it here + let idx = outer_cols_order.get(col).copied().unwrap_or_default(); + (idx, col) + }) + .collect::>(); + inner_projection_cols.sort(); + let inner_projection_cols = inner_projection_cols + .into_iter() + .map(|(_idx, col)| col) + .collect::>(); + + let plan_schema = 
plan.schema(); + let plan_cols = plan_schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect::>(); + if plan_cols != inner_projection_cols { + let expr = schema_name_projection(&plan.schema(), &inner_projection_cols)?; + plan = Arc::new(ProjectionExec::try_new(expr, plan)?); + } + + plan = user_constructor(plan)?; + + if outer_cols.len() < plan.schema().fields().len() { + let expr = schema_name_projection(&plan.schema(), outer_cols)?; + plan = Arc::new(ProjectionExec::try_new(expr, plan)?); + } + + Ok(plan) +} + +fn reassign_sort_exprs_columns( + sort_exprs: &[PhysicalSortExpr], + schema: &SchemaRef, +) -> Result> { + sort_exprs + .iter() + .map(|expr| { + Ok(PhysicalSortExpr { + expr: reassign_predicate_columns(Arc::clone(&expr.expr), schema, false)?, + options: expr.options, + }) + }) + .collect() +} + +#[cfg(test)] +mod tests { + use arrow::{ + compute::SortOptions, + datatypes::{DataType, Field, Fields, Schema, SchemaRef}, + }; + use datafusion::{ + datasource::object_store::ObjectStoreUrl, + logical_expr::Operator, + physical_plan::{ + expressions::{BinaryExpr, Literal}, + DisplayAs, PhysicalExpr, Statistics, + }, + scalar::ScalarValue, + }; + use serde::Serialize; + + use crate::{ + physical_optimizer::test_util::{assert_unknown_partitioning, OptimizationTest}, + test::TestChunk, + }; + + use super::*; + + #[test] + fn test_empty_pushdown_select() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new(EmptyExec::new(schema)), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + let test = OptimizationTest::new(plan, opt); + insta::assert_yaml_snapshot!( + test, + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " EmptyExec" + output: + Ok: + - " EmptyExec" + "### + ); + + let empty_exec = test + .output_plan() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let expected_schema = Schema::new(vec![Field::new("tag1", 
DataType::Utf8, true)]); + assert_eq!(empty_exec.schema().as_ref(), &expected_schema); + } + + #[test] + fn test_empty_pushdown_reorder() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![ + (expr_col("tag2", &schema), String::from("tag2")), + (expr_col("tag1", &schema), String::from("tag1")), + (expr_col("field", &schema), String::from("field")), + ], + Arc::new(EmptyExec::new(schema)), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + let test = OptimizationTest::new(plan, opt); + insta::assert_yaml_snapshot!( + test, + @r###" + --- + input: + - " ProjectionExec: expr=[tag2@1 as tag2, tag1@0 as tag1, field@2 as field]" + - " EmptyExec" + output: + Ok: + - " EmptyExec" + "### + ); + + let empty_exec = test + .output_plan() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let expected_schema = Schema::new(vec![ + Field::new("tag2", DataType::Utf8, true), + Field::new("tag1", DataType::Utf8, true), + Field::new("field", DataType::UInt64, true), + ]); + assert_eq!(empty_exec.schema().as_ref(), &expected_schema); + } + + #[test] + fn test_ignore_when_only_impure_projection_rename() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag2", &schema), String::from("tag1"))], + Arc::new(EmptyExec::new(schema)), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag2@1 as tag1]" + - " EmptyExec" + output: + Ok: + - " ProjectionExec: expr=[tag2@1 as tag1]" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_ignore_when_partial_impure_projection_rename() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![ + (expr_col("tag1", &schema), String::from("tag1")), + (expr_col("tag2", &schema), String::from("tag3")), + ], + Arc::new(EmptyExec::new(schema)), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + 
insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag3]" + - " EmptyExec" + output: + Ok: + - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag3]" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_ignore_impure_projection_calc() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![( + Arc::new(Literal::new(ScalarValue::from("foo"))), + String::from("tag1"), + )], + Arc::new(EmptyExec::new(schema)), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[foo as tag1]" + - " EmptyExec" + output: + Ok: + - " ProjectionExec: expr=[foo as tag1]" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_unknown_node_type() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new(TestExec::new(schema)), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " Test" + output: + Ok: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " Test" + "### + ); + } + + #[test] + fn test_union() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new(UnionExec::new(vec![ + Arc::new(TestExec::new(Arc::clone(&schema))), + Arc::new(TestExec::new(schema)), + ])), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " UnionExec" + - " Test" + - " Test" + output: + Ok: + - " UnionExec" + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " Test" + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " 
Test" + "### + ); + } + + #[test] + fn test_nested_union() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new(UnionExec::new(vec![ + Arc::new(UnionExec::new(vec![ + Arc::new(TestExec::new(Arc::clone(&schema))), + Arc::new(TestExec::new(Arc::clone(&schema))), + ])), + Arc::new(TestExec::new(schema)), + ])), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " UnionExec" + - " UnionExec" + - " Test" + - " Test" + - " Test" + output: + Ok: + - " UnionExec" + - " UnionExec" + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " Test" + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " Test" + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " Test" + "### + ); + } + + #[test] + fn test_parquet() { + let schema = Arc::new(Schema::new(vec![ + Field::new("tag1", DataType::Utf8, true), + Field::new("tag2", DataType::Utf8, true), + Field::new("tag3", DataType::Utf8, true), + Field::new("field", DataType::UInt64, true), + ])); + let projection = vec![3, 2, 1]; + let schema_projected = Arc::new(schema.project(&projection).unwrap()); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![], + statistics: Statistics::new_unknown(&schema), + projection: Some(projection), + limit: None, + table_partition_cols: vec![], + output_ordering: vec![vec![ + PhysicalSortExpr { + expr: expr_col("tag3", &schema_projected), + options: Default::default(), + }, + PhysicalSortExpr { + expr: expr_col("field", &schema_projected), + options: Default::default(), + }, + PhysicalSortExpr { + expr: expr_col("tag2", &schema_projected), + options: Default::default(), + }, + ]], + }; + let inner = ParquetExec::new(base_config, Some(expr_string_cmp("tag1", &schema)), None); + 
let plan = Arc::new( + ProjectionExec::try_new( + vec![ + (expr_col("tag2", &inner.schema()), String::from("tag2")), + (expr_col("tag3", &inner.schema()), String::from("tag3")), + ], + Arc::new(inner), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + let test = OptimizationTest::new(plan, opt); + insta::assert_yaml_snapshot!( + test, + @r###" + --- + input: + - " ProjectionExec: expr=[tag2@2 as tag2, tag3@1 as tag3]" + - " ParquetExec: file_groups={0 groups: []}, projection=[field, tag3, tag2], output_ordering=[tag3@1 ASC, field@0 ASC, tag2@2 ASC], predicate=tag1@0 = foo, pruning_predicate=tag1_min@0 <= foo AND foo <= tag1_max@1" + output: + Ok: + - " ParquetExec: file_groups={0 groups: []}, projection=[tag2, tag3], output_ordering=[tag3@1 ASC], predicate=tag1@0 = foo, pruning_predicate=tag1_min@0 <= foo AND foo <= tag1_max@1" + "### + ); + + let parquet_exec = test + .output_plan() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let expected_schema = Schema::new(vec![ + Field::new("tag2", DataType::Utf8, true), + Field::new("tag3", DataType::Utf8, true), + ]); + assert_eq!(parquet_exec.schema().as_ref(), &expected_schema); + } + + #[test] + fn test_filter_projection_split() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new( + FilterExec::try_new( + expr_string_cmp("tag2", &schema), + Arc::new(TestExec::new(schema)), + ) + .unwrap(), + ), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " FilterExec: tag2@1 = foo" + - " Test" + output: + Ok: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " FilterExec: tag2@1 = foo" + - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" + - " Test" + "### + ); + } + + #[test] + fn test_filter_inner_does_not_need_projection() { + let schema = schema(); + let inner = 
TestExec::new(Arc::new(schema.project(&[0, 1]).unwrap())); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &inner.schema()), String::from("tag1"))], + Arc::new( + FilterExec::try_new(expr_string_cmp("tag2", &inner.schema()), Arc::new(inner)) + .unwrap(), + ), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " FilterExec: tag2@1 = foo" + - " Test" + output: + Ok: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " FilterExec: tag2@1 = foo" + - " Test" + "### + ); + } + + #[test] + fn test_filter_outer_does_not_need_projection() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag2", &schema), String::from("tag2"))], + Arc::new( + FilterExec::try_new( + expr_string_cmp("tag2", &schema), + Arc::new(TestExec::new(schema)), + ) + .unwrap(), + ), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag2@1 as tag2]" + - " FilterExec: tag2@1 = foo" + - " Test" + output: + Ok: + - " FilterExec: tag2@0 = foo" + - " ProjectionExec: expr=[tag2@1 as tag2]" + - " Test" + "### + ); + } + + #[test] + fn test_filter_all_projections_unnecessary() { + let schema = schema(); + let inner = TestExec::new(Arc::new(schema.project(&[1]).unwrap())); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag2", &inner.schema()), String::from("tag2"))], + Arc::new( + FilterExec::try_new(expr_string_cmp("tag2", &inner.schema()), Arc::new(inner)) + .unwrap(), + ), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag2@0 as tag2]" + - " FilterExec: tag2@0 = foo" + - " Test" + output: + Ok: + - " FilterExec: tag2@0 = foo" + - " 
Test" + "### + ); + } + + #[test] + fn test_filter_uses_resorted_cols() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![ + (expr_col("tag2", &schema), String::from("tag2")), + (expr_col("tag1", &schema), String::from("tag1")), + (expr_col("field", &schema), String::from("field")), + ], + Arc::new( + FilterExec::try_new( + expr_and( + expr_string_cmp("tag2", &schema), + expr_string_cmp("tag1", &schema), + ), + Arc::new(TestExec::new(schema)), + ) + .unwrap(), + ), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag2@1 as tag2, tag1@0 as tag1, field@2 as field]" + - " FilterExec: tag2@1 = foo AND tag1@0 = foo" + - " Test" + output: + Ok: + - " FilterExec: tag2@0 = foo AND tag1@1 = foo" + - " ProjectionExec: expr=[tag2@1 as tag2, tag1@0 as tag1, field@2 as field]" + - " Test" + "### + ); + } + + // since `SortExec` and `FilterExec` both use `wrap_user_into_projections`, we only test a few variants for `SortExec` + #[test] + fn test_sort_projection_split() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new( + SortExec::new( + vec![PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: SortOptions { + descending: true, + ..Default::default() + }, + }], + Arc::new(TestExec::new(schema)), + ) + .with_fetch(Some(42)), + ), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" + - " Test" + output: + Ok: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" + - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" + - " Test" + "### + ); + } + + #[test] + fn 
test_sort_preserve_partitioning() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new( + SortExec::new( + vec![PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: SortOptions { + descending: true, + ..Default::default() + }, + }], + Arc::new(TestExec::new_with_partitions(schema, 2)), + ) + .with_preserve_partitioning(true) + .with_fetch(Some(42)), + ), + ) + .unwrap(), + ); + + assert_unknown_partitioning(plan.output_partitioning(), 2); + + let opt = ProjectionPushdown; + let test = OptimizationTest::new(plan, opt); + insta::assert_yaml_snapshot!( + test, + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" + - " Test" + output: + Ok: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" + - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" + - " Test" + "### + ); + + assert_unknown_partitioning(test.output_plan().unwrap().output_partitioning(), 2); + } + + // since `SortPreservingMergeExec` and `FilterExec` both use `wrap_user_into_projections`, we only test one variant for `SortPreservingMergeExec` + #[test] + fn test_sortpreservingmerge_projection_split() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new(SortPreservingMergeExec::new( + vec![PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: SortOptions { + descending: true, + ..Default::default() + }, + }], + Arc::new(TestExec::new(schema)), + )), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " SortPreservingMergeExec: [tag2@1 DESC]" + - " Test" + output: + Ok: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " SortPreservingMergeExec: 
[tag2@1 DESC]" + - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" + - " Test" + "### + ); + } + + #[test] + fn test_nested_proj_inner_is_impure() { + let schema = schema(); + let plan = Arc::new(EmptyExec::new(schema)); + let plan = Arc::new( + ProjectionExec::try_new( + vec![ + ( + Arc::new(Literal::new(ScalarValue::from("foo"))), + String::from("tag1"), + ), + ( + Arc::new(Literal::new(ScalarValue::from("bar"))), + String::from("tag2"), + ), + ], + plan, + ) + .unwrap(), + ); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &plan.schema()), String::from("tag1"))], + plan, + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " ProjectionExec: expr=[foo as tag1, bar as tag2]" + - " EmptyExec" + output: + Ok: + - " ProjectionExec: expr=[foo as tag1]" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_nested_proj_inner_is_pure() { + let schema = schema(); + let plan = Arc::new(EmptyExec::new(schema)); + let plan = Arc::new( + ProjectionExec::try_new( + vec![ + (expr_col("tag1", &plan.schema()), String::from("tag1")), + (expr_col("tag2", &plan.schema()), String::from("tag2")), + ], + plan, + ) + .unwrap(), + ); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &plan.schema()), String::from("tag1"))], + plan, + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + let test = OptimizationTest::new(plan, opt); + insta::assert_yaml_snapshot!( + test, + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" + - " EmptyExec" + output: + Ok: + - " EmptyExec" + "### + ); + let empty_exec = test + .output_plan() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let expected_schema = Schema::new(vec![Field::new("tag1", DataType::Utf8, true)]); + assert_eq!(empty_exec.schema().as_ref(), 
&expected_schema); + } + + // since `DeduplicateExec` and `FilterExec` both use `wrap_user_into_projections`, we only test a few variants for `DeduplicateExec` + #[test] + fn test_dedup_projection_split1() { + let schema = schema(); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new(DeduplicateExec::new( + Arc::new(TestExec::new(Arc::clone(&schema))), + vec![PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: SortOptions { + descending: true, + ..Default::default() + }, + }], + false, + )), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " DeduplicateExec: [tag2@1 DESC]" + - " Test" + output: + Ok: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " DeduplicateExec: [tag2@1 DESC]" + - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" + - " Test" + "### + ); + } + + #[test] + fn test_dedup_projection_split2() { + let schema = Arc::new(Schema::new(vec![ + Field::new("tag1", DataType::Utf8, true), + Field::new("tag2", DataType::Utf8, true), + Field::new("field1", DataType::UInt64, true), + Field::new("field2", DataType::UInt64, true), + ])); + let plan = Arc::new( + ProjectionExec::try_new( + vec![ + (expr_col("tag1", &schema), String::from("tag1")), + (expr_col("field1", &schema), String::from("field1")), + ], + Arc::new(DeduplicateExec::new( + Arc::new(TestExec::new(Arc::clone(&schema))), + vec![ + PhysicalSortExpr { + expr: expr_col("tag1", &schema), + options: SortOptions { + descending: true, + ..Default::default() + }, + }, + PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: SortOptions { + descending: false, + ..Default::default() + }, + }, + ], + false, + )), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " 
ProjectionExec: expr=[tag1@0 as tag1, field1@2 as field1]" + - " DeduplicateExec: [tag1@0 DESC,tag2@1 ASC]" + - " Test" + output: + Ok: + - " ProjectionExec: expr=[tag1@0 as tag1, field1@2 as field1]" + - " DeduplicateExec: [tag1@0 DESC,tag2@1 ASC]" + - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2, field1@2 as field1]" + - " Test" + "### + ); + } + + #[test] + fn test_recordbatches() { + let schema = schema(); + let chunk = TestChunk::new("table") + .with_tag_column("tag1") + .with_u64_column("field"); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("tag1", &schema), String::from("tag1"))], + Arc::new(RecordBatchesExec::new( + vec![Arc::new(chunk) as _], + schema, + None, + )), + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + let test = OptimizationTest::new(plan, opt); + insta::assert_yaml_snapshot!( + test, + @r###" + --- + input: + - " ProjectionExec: expr=[tag1@0 as tag1]" + - " RecordBatchesExec: chunks=1, projection=[tag1, tag2, field]" + output: + Ok: + - " RecordBatchesExec: chunks=1, projection=[tag1]" + "### + ); + + let recordbatches_exec = test + .output_plan() + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let expected_schema = Schema::new(vec![Field::new("tag1", DataType::Utf8, true)]); + assert_eq!(recordbatches_exec.schema().as_ref(), &expected_schema); + } + + #[test] + fn test_integration() { + let schema = Arc::new(Schema::new(vec![ + Field::new("tag1", DataType::Utf8, true), + Field::new("tag2", DataType::Utf8, true), + Field::new("field1", DataType::UInt64, true), + Field::new("field2", DataType::UInt64, true), + ])); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![], + }; + let plan = Arc::new(ParquetExec::new(base_config, None, None)); + let plan = 
Arc::new(UnionExec::new(vec![plan])); + let plan_schema = plan.schema(); + let plan = Arc::new(DeduplicateExec::new( + plan, + vec![ + PhysicalSortExpr { + expr: expr_col("tag1", &plan_schema), + options: Default::default(), + }, + PhysicalSortExpr { + expr: expr_col("tag2", &plan_schema), + options: Default::default(), + }, + ], + false, + )); + let plan = + Arc::new(FilterExec::try_new(expr_string_cmp("tag2", &plan.schema()), plan).unwrap()); + let plan = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("field1", &plan.schema()), String::from("field1"))], + plan, + ) + .unwrap(), + ); + let opt = ProjectionPushdown; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ProjectionExec: expr=[field1@2 as field1]" + - " FilterExec: tag2@1 = foo" + - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={0 groups: []}, projection=[tag1, tag2, field1, field2]" + output: + Ok: + - " ProjectionExec: expr=[field1@0 as field1]" + - " FilterExec: tag2@1 = foo" + - " ProjectionExec: expr=[field1@0 as field1, tag2@2 as tag2]" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC]" + - " UnionExec" + - " ParquetExec: file_groups={0 groups: []}, projection=[field1, tag1, tag2]" + "### + ); + } + + #[test] + fn test_project_output_ordering_keep() { + let schema = schema(); + let projection = vec!["tag1", "tag2"]; + let output_ordering = vec![ + PhysicalSortExpr { + expr: expr_col("tag1", &schema), + options: Default::default(), + }, + PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: Default::default(), + }, + ]; + + insta::assert_yaml_snapshot!( + ProjectOutputOrdering::new(&schema, output_ordering, projection), + @r###" + --- + output_ordering: + - tag1@0 + - tag2@1 + projection: + - tag1 + - tag2 + projected_ordering: + - tag1@0 + - tag2@1 + "### + ); + } + + #[test] + fn test_project_output_ordering_project_prefix() { + let schema = schema(); + let projection = vec!["tag1"]; // 
prefix of the sort key + let output_ordering = vec![ + PhysicalSortExpr { + expr: expr_col("tag1", &schema), + options: Default::default(), + }, + PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: Default::default(), + }, + ]; + + insta::assert_yaml_snapshot!( + ProjectOutputOrdering::new(&schema, output_ordering, projection), + @r###" + --- + output_ordering: + - tag1@0 + - tag2@1 + projection: + - tag1 + projected_ordering: + - tag1@0 + "### + ); + } + + #[test] + fn test_project_output_ordering_project_non_prefix() { + let schema = schema(); + let projection = vec!["tag2"]; // in sort key, but not prefix + let output_ordering = vec![ + PhysicalSortExpr { + expr: expr_col("tag1", &schema), + options: Default::default(), + }, + PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: Default::default(), + }, + ]; + + insta::assert_yaml_snapshot!( + ProjectOutputOrdering::new(&schema, output_ordering, projection), + @r###" + --- + output_ordering: + - tag1@0 + - tag2@1 + projection: + - tag2 + projected_ordering: [] + "### + ); + } + + #[test] + fn test_project_output_ordering_projection_reorder() { + let schema = schema(); + let projection = vec!["tag2", "tag1", "field"]; // in different order than sort key + let output_ordering = vec![ + PhysicalSortExpr { + expr: expr_col("tag1", &schema), + options: Default::default(), + }, + PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: Default::default(), + }, + ]; + + insta::assert_yaml_snapshot!( + ProjectOutputOrdering::new(&schema, output_ordering, projection), + @r###" + --- + output_ordering: + - tag1@0 + - tag2@1 + projection: + - tag2 + - tag1 + - field + projected_ordering: + - tag1@1 + - tag2@0 + "### + ); + } + + #[test] + fn test_project_output_ordering_constant() { + let schema = schema(); + let projection = vec!["tag2"]; + let output_ordering = vec![ + // ordering by a constant is ignored + PhysicalSortExpr { + expr: datafusion::physical_plan::expressions::lit(1), + 
options: Default::default(), + }, + PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: Default::default(), + }, + ]; + + insta::assert_yaml_snapshot!( + ProjectOutputOrdering::new(&schema, output_ordering, projection), + @r###" + --- + output_ordering: + - "1" + - tag2@1 + projection: + - tag2 + projected_ordering: [] + "### + ); + } + + #[test] + fn test_project_output_ordering_constant_second_position() { + let schema = schema(); + let projection = vec!["tag2"]; + let output_ordering = vec![ + PhysicalSortExpr { + expr: expr_col("tag2", &schema), + options: Default::default(), + }, + // ordering by a constant is ignored + PhysicalSortExpr { + expr: datafusion::physical_plan::expressions::lit(1), + options: Default::default(), + }, + ]; + + insta::assert_yaml_snapshot!( + ProjectOutputOrdering::new(&schema, output_ordering, projection), + @r###" + --- + output_ordering: + - tag2@1 + - "1" + projection: + - tag2 + projected_ordering: + - tag2@0 + "### + ); + } + + /// project the output_ordering with the projection, + // derive serde to make a nice 'insta' snapshot + #[derive(Debug, Serialize)] + struct ProjectOutputOrdering { + output_ordering: Vec, + projection: Vec, + projected_ordering: Vec, + } + + impl ProjectOutputOrdering { + fn new( + schema: &Schema, + output_ordering: Vec, + projection: Vec<&'static str>, + ) -> Self { + let projected_fields: Fields = projection + .iter() + .map(|field_name| { + schema + .field_with_name(field_name) + .expect("finding field") + .clone() + }) + .collect(); + let projected_schema = Arc::new(Schema::new(projected_fields)); + + let projected_ordering = project_output_ordering(&output_ordering, projected_schema); + + let projected_ordering = match projected_ordering { + Ok(projected_ordering) => format_sort_exprs(&projected_ordering), + Err(e) => vec![e.to_string()], + }; + + Self { + output_ordering: format_sort_exprs(&output_ordering), + projection: projection.iter().map(|s| s.to_string()).collect(), + 
projected_ordering, + } + } + } + + fn schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("tag1", DataType::Utf8, true), + Field::new("tag2", DataType::Utf8, true), + Field::new("field", DataType::UInt64, true), + ])) + } + + fn format_sort_exprs(sort_exprs: &[PhysicalSortExpr]) -> Vec { + sort_exprs + .iter() + .map(|expr| { + let PhysicalSortExpr { expr, options: _ } = expr; + expr.to_string() + }) + .collect::>() + } + + fn expr_col(name: &str, schema: &SchemaRef) -> Arc { + Arc::new(Column::new_with_schema(name, schema).unwrap()) + } + + fn expr_string_cmp(col: &str, schema: &SchemaRef) -> Arc { + Arc::new(BinaryExpr::new( + expr_col(col, schema), + Operator::Eq, + Arc::new(Literal::new(ScalarValue::from("foo"))), + )) + } + + fn expr_and(a: Arc, b: Arc) -> Arc { + Arc::new(BinaryExpr::new(a, Operator::And, b)) + } + + #[derive(Debug)] + struct TestExec { + schema: SchemaRef, + partitions: usize, + } + + impl TestExec { + fn new(schema: SchemaRef) -> Self { + Self::new_with_partitions(schema, 1) + } + + fn new_with_partitions(schema: SchemaRef, partitions: usize) -> Self { + Self { schema, partitions } + } + } + + impl ExecutionPlan for TestExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + datafusion::physical_plan::Partitioning::UnknownPartitioning(self.partitions) + } + + fn output_ordering(&self) -> Option<&[datafusion::physical_expr::PhysicalSortExpr]> { + None + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + assert!(children.is_empty()); + Ok(self) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + unimplemented!() + } + + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) + } + } + + impl DisplayAs for TestExec { + 
fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter<'_>, + ) -> std::fmt::Result { + write!(f, "Test") + } + } +} diff --git a/iox_query/src/physical_optimizer/sort/mod.rs b/iox_query/src/physical_optimizer/sort/mod.rs new file mode 100644 index 0000000..9a9be8b --- /dev/null +++ b/iox_query/src/physical_optimizer/sort/mod.rs @@ -0,0 +1,8 @@ +//! Rules specific to [`SortExec`]. +//! +//! [`SortExec`]: datafusion::physical_plan::sorts::sort::SortExec + +pub mod order_union_sorted_inputs; +pub mod parquet_sortness; +pub mod push_sort_through_union; +pub mod util; diff --git a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs new file mode 100644 index 0000000..0266108 --- /dev/null +++ b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs @@ -0,0 +1,1487 @@ +use std::sync::Arc; + +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{ + displayable, expressions::Column, sorts::sort_preserving_merge::SortPreservingMergeExec, + union::UnionExec, ExecutionPlan, + }, +}; +use observability_deps::tracing::{trace, warn}; + +use crate::{ + physical_optimizer::sort::util::{collect_statistics_min_max, sort_by_value_ranges}, + provider::progressive_eval::ProgressiveEvalExec, +}; + +/// IOx specific optimization that eliminates a `SortPreservingMerge` +/// by reordering inputs in terms of their value ranges. If all inputs are non overlapping and ordered +/// by value range, they can be concatenated by `ProgressiveEval` while +/// maintaining the desired output order without actually merging. 
+/// +/// Find this structure: +/// SortPreservingMergeExec - on one column (DESC or ASC) +/// UnionExec +/// and if +/// - all inputs of UnionExec are already sorted (or has SortExec) with sortExpr also on time DESC or ASC accarsdingly and +/// - the streams do not overlap in values of the sorted column +/// do: +/// - order them by the sorted column DESC or ASC accordingly and +/// - replace SortPreservingMergeExec with ProgressiveEvalExec +/// +/// Notes: The difference between SortPreservingMergeExec & ProgressiveEvalExec +/// - SortPreservingMergeExec do the merge of sorted input streams. It needs each stream sorted but the streams themselves +/// can be in any random order and they can also overlap in values of sorted columns. +/// - ProgressiveEvalExec only outputs data in their input order of the streams and not do any merges. Thus in order to +/// output data in the right sort order, these three conditions must be true: +/// 1. Each input stream must sorted on the same column DESC or ASC accordingly +/// 2. The streams must be sorted on the column DESC or ASC accordingly +/// 3. The streams must not overlap in the values of that column. +/// +/// Example: for col_name ranges: +/// |--- r1---|-- r2 ---|-- r3 ---|-- r4 --| +/// +/// Here is what the input look like: +/// +/// SortPreservingMergeExec: time@2 DESC, fetch=1 +/// UnionExec +/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r3 +/// ... +/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r1 +/// ... +/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r4 +/// ... +/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r2 -- assuming this SortExec has 2 output sorted streams +/// ... +/// +/// The streams do not overlap in time, and they are already sorted by time DESC. 
+/// +/// The output will be the same except that all the input streams will be sorted by time DESC too and looks like +/// +/// SortPreservingMergeExec: time@2 DESC, fetch=1 +/// UnionExec +/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r1 +/// ... +/// SortPreservingMergeExec: -- need this extra to merge the 2 streams into one +/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r2 +/// ... +/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r3 +/// ... +/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r4 +/// ... +/// + +pub(crate) struct OrderUnionSortedInputs; + +impl PhysicalOptimizerRule for OrderUnionSortedInputs { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + // Find SortPreservingMergeExec + let Some(sort_preserving_merge_exec) = + plan.as_any().downcast_ref::() + else { + return Ok(Transformed::No(plan)); + }; + + // Check if the sortExpr is only on one column + let sort_expr = sort_preserving_merge_exec.expr(); + if sort_expr.len() != 1 { + trace!( + ?sort_expr, + "-------- sortExpr is not on one column. No optimization" + ); + return Ok(Transformed::No(plan)); + }; + let Some(sorted_col) = sort_expr[0].expr.as_any().downcast_ref::() else { + trace!( + ?sort_expr, + "-------- sortExpr is not on pure column but expression. No optimization" + ); + return Ok(Transformed::No(plan)); + }; + let sort_options = sort_expr[0].options; + + // Find UnionExec + let Some(union_exec) = sort_preserving_merge_exec + .input() + .as_any() + .downcast_ref::() + else { + trace!("-------- SortPreservingMergeExec input is not UnionExec. 
No optimization"); + return Ok(Transformed::No(plan)); + }; + + // Check all inputs of UnionExec must be already sorted and on the same sort_expr of SortPreservingMergeExec + let Some(union_output_ordering) = union_exec.output_ordering() else { + warn!(plan=%displayable(plan.as_ref()).indent(false), "Union input to SortPreservingMerge is not sorted"); + return Ok(Transformed::No(plan)); + }; + + // Check if the first PhysicalSortExpr is the same as the sortExpr[0] in SortPreservingMergeExec + if sort_expr[0] != union_output_ordering[0] { + warn!(?sort_expr, ?union_output_ordering, plan=%displayable(plan.as_ref()).indent(false), "-------- Sort order of SortPreservingMerge and its children are different"); + return Ok(Transformed::No(plan)); + } + + let Some(value_ranges) = collect_statistics_min_max(union_exec.inputs(), sorted_col.name())? + else { + return Ok(Transformed::No(plan)); + }; + + // Sort the inputs by their value ranges + trace!("-------- value_ranges: {:?}", value_ranges); + let Some(plans_value_ranges) = + sort_by_value_ranges(union_exec.inputs().to_vec(), value_ranges, sort_options)? + else { + trace!("-------- inputs are not sorted by value ranges. No optimization"); + return Ok(Transformed::No(plan)); + }; + + // If each input of UnionExec outputs many sorted streams, data of different streams may overlap and + // even if they do not overlapped, their streams can be in any order. We need to (sort) merge them first + // to have a single output stream out to guarantee the output is sorted. 
+ let new_inputs = plans_value_ranges.plans + .iter() + .map(|input| { + if input.output_partitioning().partition_count() > 1 { + // Add SortPreservingMergeExec on top of this input + let sort_preserving_merge_exec = Arc::new( + SortPreservingMergeExec::new(sort_expr.to_vec(), Arc::clone(input)) + .with_fetch(sort_preserving_merge_exec.fetch()), + ); + Ok(sort_preserving_merge_exec as _) + } else { + Ok(Arc::clone(input)) + } + }) + .collect::>>()?; + + let new_union_exec = Arc::new(UnionExec::new(new_inputs)); + + // Replace SortPreservingMergeExec with ProgressiveEvalExec + let progresive_eval_exec = Arc::new(ProgressiveEvalExec::new( + new_union_exec, + Some(plans_value_ranges.value_ranges), + sort_preserving_merge_exec.fetch(), + )); + + Ok(Transformed::Yes(progresive_eval_exec)) + }) + } + + fn name(&self) -> &str { + "order_union_sorted_inputs" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow::{compute::SortOptions, datatypes::SchemaRef}; + use datafusion::{ + logical_expr::Operator, + physical_expr::PhysicalSortExpr, + physical_plan::{ + expressions::{BinaryExpr, Column}, + limit::GlobalLimitExec, + projection::ProjectionExec, + repartition::RepartitionExec, + sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, + union::UnionExec, + ExecutionPlan, Partitioning, PhysicalExpr, + }, + scalar::ScalarValue, + }; + use schema::{InfluxFieldType, SchemaBuilder as IOxSchemaBuilder}; + + use crate::{ + physical_optimizer::{ + sort::order_union_sorted_inputs::OrderUnionSortedInputs, test_util::OptimizationTest, + }, + provider::{chunks_to_physical_nodes, DeduplicateExec, RecordBatchesExec}, + statistics::{column_statistics_min_max, compute_stats_column_min_max}, + test::{format_execution_plan, TestChunk}, + QueryChunk, CHUNK_ORDER_COLUMN_NAME, + }; + + // ------------------------------------------------------------------ + // Positive tests: the right structure found -> plan 
optimized + // ------------------------------------------------------------------ + + #[test] + fn test_limit_mix_record_batch_parquet_1_desc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // RecordBatchesExec -- 3 chunks [2001, 3000] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // + // Output plan: the 2 SortExecs will be swapped the order to have time range [2001, 3500] first + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_batches1 = record_batches_exec_with_value_range(3, 2001, 3000); + let plan_batches2 = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches1, plan_batches2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + // min max of plan_sorted1 is [1000, 2000] + // structure of plan_sorted1 + let p_sort1 = Arc::clone(&plan_sort1) as Arc; + insta::assert_yaml_snapshot!( + format_execution_plan(&p_sort1), + @r###" + --- + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + let min_max_sort1 = compute_stats_column_min_max(&*plan_sort1, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_sort1).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(2000), None) + ) + 
); + // + // min max of plan_sorted2 is [2001, 3500] + let p_sort2 = Arc::clone(&plan_sort2) as Arc; + insta::assert_yaml_snapshot!( + format_execution_plan(&p_sort2), + @r###" + --- + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + let min_max_sort2 = compute_stats_column_min_max(&*plan_sort2, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_sort2).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(2001), None), + ScalarValue::TimestampNanosecond(Some(3500), None) + ) + ); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // min max of plan_spm is [1000, 3500] + let p_spm = Arc::clone(&plan_spm) as Arc; + insta::assert_yaml_snapshot!( + format_execution_plan(&p_spm), + @r###" + --- + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + let min_max_spm = compute_stats_column_min_max(&*plan_spm, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_spm).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(3500), None) + ) + ); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // Output plan: the 2 
SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_limit_mix_record_batch_parquet_2_desc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[time@2 DESC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2001, 3000] + // + // Output 
plan: the 2 SortExecs will be swapped the order to have time range [2001, 3500] first + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " GlobalLimitExec: 
skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // test on non-time column & order desc + #[test] + fn test_limit_mix_record_batch_parquet_non_time_sort_desc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [field1@2 DESC] + // UnionExec + // SortExec: expr=[field1@2 DESC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[field1@2 DESC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[field1@2 DESC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2001, 3000] + // + // Output plan: the 2 SortExecs will be swapped the order to have time range [2001, 3500] first + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("field1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = 
Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Desc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(Int64(2001), Int64(3500)), (Int64(1000), Int64(2000))]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + 
- " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // test on non-time column & order asc + #[test] + fn test_limit_mix_record_batch_parquet_non_time_sort_asc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [field1@2 ASC] + // UnionExec + // SortExec: expr=[field1@2 ASC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[field1@2 ASC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[field1@2 ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2001, 3000] + // + // Output plan: same as input plan + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("field1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Asc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + 
+ let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(Int64(1000), Int64(2000)), (Int64(2001), Int64(3500))]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right sort preserving merge struct --> optimize + #[test] + fn test_spm_time_desc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@2 DESC] + 
// UnionExec + // SortExec: expr=[time@2 DESC] + // ParquetExec + // SortExec: expr=[time@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: 2 SortExec are swapped + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " 
RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right sort preserving merge struct --> optimize + #[test] + fn test_spm_non_time_desc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [field1@2 DESC] + // UnionExec + // SortExec: expr=[field1@2 DESC] + // ParquetExec + // SortExec: expr=[field1@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: 2 SortExec are swapped + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, 
projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(Int64(2001), Int64(3500)), (Int64(1000), Int64(2000))]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right sort preserving merge struct --> optimize + #[test] + fn test_spm_non_time_asc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [field1@2 ASC] + // UnionExec + // SortExec: expr=[field1@2 ASC] + // ParquetExec + // SortExec: expr=[field1@2 ASC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: 2 SortExec ordered as above + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Asc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = 
Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // output stays the same as input + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(Int64(1000), Int64(2000)), (Int64(2001), Int64(3500))]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // Plan starts with SortPreservingMerge and includes deduplication & projections. 
+ // All conditions meet --> optimize + #[test] + fn test_spm_time_desc_with_dedupe_and_proj() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[time] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[time] + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + // + // Output: 2 SortExec are swapped + + let schema = schema(); + + let final_sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + + // Sort plan of the first parquet: + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[time] + // ParquetExec + let plan_parquet_1 = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_projection_1 = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("time", &schema), String::from("time"))], + plan_parquet_1, + ) + .unwrap(), + ); + let plan_sort1 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_1)); + + // Sort plan of the second parquet and the record batch + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[time] + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + let plan_parquet_2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let 
plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + let dedupe_sort_order = ordering_with_options( + [ + ("col1", SortOp::Asc), + ("col2", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + let plan_sort_rb = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_batches)); + let plan_sort_pq = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_parquet_2)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort_rb, plan_sort_pq])); + let plan_spm_1 = Arc::new(SortPreservingMergeExec::new( + dedupe_sort_order.clone(), + plan_union_1, + )); + let plan_dedupe = Arc::new(DeduplicateExec::new(plan_spm_1, dedupe_sort_order, false)); + let plan_projection_2 = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("time", &schema), String::from("time"))], + plan_dedupe, + ) + .unwrap(), + ); + let plan_sort2 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_2)); + + // Union them together + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + // SortPreservingMerge them + let plan_spm = Arc::new(SortPreservingMergeExec::new( + final_sort_order.clone(), + plan_union_2, + )); + + // compute statistics + let min_max_spm = compute_stats_column_min_max(&*plan_spm, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_spm).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(3500), None) + ) + ); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " 
SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // ------------------------------------------------------------------ + // Negative tests: 
the right structure not found -> nothing optimized + // ------------------------------------------------------------------ + + // Right stucture but sort on 2 columns --> plan stays the same + #[test] + fn test_negative_spm_2_column_sort_desc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@3 DESC, field1@2 DESC] + // UnionExec + // SortExec: expr=[time@3 DESC, field1@2 DESC] + // ParquetExec + // SortExec: expr=[time@3 DESC, field1@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: same as input + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = + ordering_with_options([("time", SortOp::Desc), ("field1", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, 
time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & random plan --> plan stay the same + #[test] + fn test_negative_no_limit() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_batches = record_batches_exec_with_value_range(2, 1500, 2500); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS 
LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // has limit but no sort preserving merge --> plan stay the same + #[test] + fn test_negative_limit_no_preserving_merge() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + + let plan_batches1 = record_batches_exec_with_value_range(1, 1000, 2000); + let plan_batches2 = record_batches_exec_with_value_range(3, 2001, 3000); + let plan_batches3 = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches2, plan_batches3])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_batches1)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); 
+ + let plan_limit = Arc::new(GlobalLimitExec::new(plan_union_2, 0, Some(1))); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + } + + // right structure and same sort order but inputs of uion overlap --> plan stay the same + #[test] + fn test_negative_overlap() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] that overlaps with the other SorExec + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2000, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[time@2 DESC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2000, 3000] + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = 
parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2000, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, 
__chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right union struct --> plan stay the same + #[test] + fn test_negative_no_sortpreservingmerge_input_union() { + test_helpers::maybe_start_logging(); + + // plan: + // UnionExec + // SortExec: expr=[time@2 DESC] + // ParquetExec + // SortExec: expr=[time@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_union_2, opt), + @r###" + --- + input: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, 
__chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // Projection expression (field + field) ==> not optimze. Plan stays the same + #[test] + fn test_negative_spm_time_desc_with_dedupe_and_proj_on_expr() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[field1 + field1, time] <-- NOTE: has expresssion col1+col2 + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[field1 + field1, time] <-- NOTE: has expresssion col1+col2 + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + + let schema = schema(); + + let final_sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + + // Sort plan of the first parquet: + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[field1 + field1, time] + // ParquetExec + let plan_parquet_1 = parquet_exec_with_value_range(&schema, 
1000, 2000); + + let field_expr = Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("field1", &schema).unwrap()), + Operator::Plus, + Arc::new(Column::new_with_schema("field1", &schema).unwrap()), + )); + let plan_projection_1 = Arc::new( + ProjectionExec::try_new( + vec![ + (Arc::::clone(&field_expr), String::from("field")), + (expr_col("time", &schema), String::from("time")), + ], + plan_parquet_1, + ) + .unwrap(), + ); + let plan_sort1 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_1)); + + // Sort plan of the second parquet and the record batch + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[field1 + field1, time] + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + let plan_parquet_2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + let dedupe_sort_order = ordering_with_options( + [ + ("col1", SortOp::Asc), + ("col2", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + let plan_sort_rb = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_batches)); + let plan_sort_pq = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_parquet_2)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort_rb, plan_sort_pq])); + let plan_spm_1 = Arc::new(SortPreservingMergeExec::new( + dedupe_sort_order.clone(), + plan_union_1, + )); + let plan_dedupe = Arc::new(DeduplicateExec::new(plan_spm_1, dedupe_sort_order, false)); + let plan_projection_2 = Arc::new( + ProjectionExec::try_new( + vec![ + (field_expr, String::from("field")), + (expr_col("time", &schema), String::from("time")), + ], + plan_dedupe, + ) + 
.unwrap(), + ); + let plan_sort2 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_2)); + + // Union them together + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + // SortPreservingMerge them + let plan_spm = Arc::new(SortPreservingMergeExec::new( + final_sort_order.clone(), + plan_union_2, + )); + + // compute statistics: no stats becasue the ProjectionExec includes expression + let min_max_spm = compute_stats_column_min_max(&*plan_spm, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_spm); + assert!(min_max.is_none()); + + // output plan stays the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - 
" ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // ------------------------------------------------------------------ + // Helper functions + // ------------------------------------------------------------------ + + fn schema() -> SchemaRef { + IOxSchemaBuilder::new() + .tag("col1") + .tag("col2") + .influx_field("field1", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into() + } + + fn expr_col(name: &str, schema: &SchemaRef) -> Arc { + Arc::new(Column::new_with_schema(name, schema).unwrap()) + } + + // test chunk with time range and field1's value range + fn test_chunk(min: i64, max: i64, parquet_data: bool) -> Arc { + let chunk = TestChunk::new("t") + .with_time_column_with_stats(Some(min), Some(max)) + .with_tag_column_with_stats("col1", Some("AL"), Some("MT")) + .with_tag_column_with_stats("col2", Some("MA"), Some("VY")) + .with_i64_field_column_with_stats("field1", Some(min), Some(max)); + + let chunk = if parquet_data { + 
chunk.with_dummy_parquet_file() + } else { + chunk + }; + + Arc::new(chunk) as Arc + } + + fn record_batches_exec_with_value_range( + n_chunks: usize, + min: i64, + max: i64, + ) -> Arc { + let chunks = std::iter::repeat(test_chunk(min, max, false)) + .take(n_chunks) + .collect::>(); + + Arc::new(RecordBatchesExec::new(chunks, schema(), None)) + } + + fn parquet_exec_with_value_range( + schema: &SchemaRef, + min: i64, + max: i64, + ) -> Arc { + let chunk = test_chunk(min, max, true); + let plan = chunks_to_physical_nodes(schema, None, vec![chunk], 1); + + if let Some(union_exec) = plan.as_any().downcast_ref::() { + if union_exec.inputs().len() == 1 { + Arc::clone(&union_exec.inputs()[0]) + } else { + plan + } + } else { + plan + } + } + + fn ordering_with_options( + cols: [(&str, SortOp); N], + schema: &SchemaRef, + ) -> Vec { + cols.into_iter() + .map(|col| PhysicalSortExpr { + expr: Arc::new(Column::new_with_schema(col.0, schema.as_ref()).unwrap()), + options: SortOptions { + descending: col.1 == SortOp::Desc, + nulls_first: false, + }, + }) + .collect() + } + + #[derive(Debug, PartialEq)] + enum SortOp { + Asc, + Desc, + } +} diff --git a/iox_query/src/physical_optimizer/sort/parquet_sortness.rs b/iox_query/src/physical_optimizer/sort/parquet_sortness.rs new file mode 100644 index 0000000..c0f4a13 --- /dev/null +++ b/iox_query/src/physical_optimizer/sort/parquet_sortness.rs @@ -0,0 +1,658 @@ +use std::sync::Arc; + +use datafusion::{ + common::tree_node::{RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter}, + config::ConfigOptions, + datasource::physical_plan::{FileScanConfig, ParquetExec}, + error::Result, + physical_expr::{PhysicalSortExpr, PhysicalSortRequirement}, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{sorts::sort::SortExec, ExecutionPlan}, +}; +use observability_deps::tracing::warn; + +use crate::config::IoxConfigExt; + +/// Trade wider fan-out of not having to sort parquet files. 
+/// +/// This will fan-out [`ParquetExec`] nodes beyond [`target_partitions`] if it is under a node that desires sorting, e.g.: +/// +/// - [`SortExec`] itself +/// - any other node that requires sorting, e.g. [`DeduplicateExec`] +/// +/// [`DeduplicateExec`]: crate::provider::DeduplicateExec +/// [`target_partitions`]: datafusion::common::config::ExecutionOptions::target_partitions +#[derive(Debug, Default)] +pub struct ParquetSortness; + +impl PhysicalOptimizerRule for ParquetSortness { + fn optimize( + &self, + plan: Arc, + config: &ConfigOptions, + ) -> Result> { + plan.transform_down(&|plan| { + let Some(children_with_sort) = detect_children_with_desired_ordering(plan.as_ref()) + else { + return Ok(Transformed::No(plan)); + }; + let mut children_new = Vec::with_capacity(children_with_sort.len()); + for (child, desired_ordering) in children_with_sort { + let mut rewriter = ParquetSortnessRewriter { + config, + desired_ordering: &desired_ordering, + }; + let child = Arc::clone(&child).rewrite(&mut rewriter)?; + children_new.push(child); + } + + Ok(Transformed::Yes(plan.with_new_children(children_new)?)) + }) + } + + fn name(&self) -> &str { + "parquet_sortness" + } + + fn schema_check(&self) -> bool { + true + } +} + +type ChildWithSorting = (Arc, Vec); + +fn detect_children_with_desired_ordering( + plan: &dyn ExecutionPlan, +) -> Option> { + if let Some(sort_exec) = plan.as_any().downcast_ref::() { + return Some(vec![( + Arc::clone(sort_exec.input()), + sort_exec.expr().to_vec(), + )]); + } + + let required_input_ordering = plan.required_input_ordering(); + if !required_input_ordering.iter().all(|expr| expr.is_some()) { + // not all inputs require sorting, ignore it + return None; + } + + let children = plan.children(); + if children.len() != required_input_ordering.len() { + // this should normally not happen, but we ignore it + return None; + } + if children.is_empty() { + // leaf node + return None; + } + + Some( + children + .into_iter() + .zip( + 
required_input_ordering + .into_iter() + .map(|requirement| requirement.expect("just checked")) + .map(PhysicalSortRequirement::to_sort_exprs), + ) + .collect(), + ) +} + +#[derive(Debug)] +struct ParquetSortnessRewriter<'a> { + config: &'a ConfigOptions, + desired_ordering: &'a [PhysicalSortExpr], +} + +impl<'a> TreeNodeRewriter for ParquetSortnessRewriter<'a> { + type N = Arc; + + fn pre_visit(&mut self, node: &Self::N) -> Result { + if detect_children_with_desired_ordering(node.as_ref()).is_some() { + // another sort or sort-desiring node + Ok(RewriteRecursion::Stop) + } else { + Ok(RewriteRecursion::Continue) + } + } + + fn mutate(&mut self, node: Self::N) -> Result { + let Some(parquet_exec) = node.as_any().downcast_ref::() else { + // not a parquet exec + return Ok(node); + }; + + let base_config = parquet_exec.base_config(); + if base_config.output_ordering.is_empty() { + // no output ordering requested + return Ok(node); + } + + if base_config.file_groups.iter().all(|g| g.len() < 2) { + // already flat + return Ok(node); + } + + // Protect against degenerative plans + let n_files = base_config.file_groups.iter().map(Vec::len).sum::(); + let max_parquet_fanout = self + .config + .extensions + .get::() + .cloned() + .unwrap_or_default() + .max_parquet_fanout; + if n_files > max_parquet_fanout { + warn!( + n_files, + max_parquet_fanout, "cannot use pre-sorted parquet files, fan-out too wide" + ); + return Ok(node); + } + + let base_config = FileScanConfig { + file_groups: base_config + .file_groups + .iter() + .flat_map(|g| g.iter()) + .map(|f| vec![f.clone()]) + .collect(), + ..base_config.clone() + }; + let new_parquet_exec = + ParquetExec::new(base_config, parquet_exec.predicate().cloned(), None); + + // did this help? 
+ if new_parquet_exec.output_ordering() == Some(self.desired_ordering) { + Ok(Arc::new(new_parquet_exec)) + } else { + Ok(node) + } + } +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; + use datafusion::{ + datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl}, + physical_expr::PhysicalSortExpr, + physical_plan::{ + expressions::Column, placeholder_row::PlaceholderRowExec, sorts::sort::SortExec, + union::UnionExec, Statistics, + }, + }; + use object_store::{path::Path, ObjectMeta}; + + use crate::{ + chunk_order_field, + physical_optimizer::test_util::{assert_unknown_partitioning, OptimizationTest}, + provider::{DeduplicateExec, RecordBatchesExec}, + CHUNK_ORDER_COLUMN_NAME, + }; + + use super::*; + + #[test] + fn test_happy_path_sort() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col2", "col1"], &schema)], + }; + let inner = ParquetExec::new(base_config, None, None); + let plan = Arc::new( + SortExec::new(ordering(["col2", "col1"], &schema), Arc::new(inner)) + .with_fetch(Some(42)), + ); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + output: + Ok: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + "### + ); + } + + #[test] + fn test_happy_path_dedup() { + let 
schema = schema_with_chunk_order(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col2", "col1", CHUNK_ORDER_COLUMN_NAME], &schema)], + }; + let inner = ParquetExec::new(base_config, None, None); + let plan = Arc::new(DeduplicateExec::new( + Arc::new(inner), + ordering(["col2", "col1"], &schema), + true, + )); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, __chunk_order@3 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, __chunk_order@3 ASC]" + "### + ); + } + + #[test] + fn test_sort_partitioning() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2)], vec![file(3)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col2", "col1"], &schema)], + }; + let inner = ParquetExec::new(base_config, None, None); + let plan = Arc::new( + SortExec::new(ordering(["col2", "col1"], &schema), Arc::new(inner)) + .with_preserve_partitioning(true) + .with_fetch(Some(42)), + ); + + assert_unknown_partitioning(plan.output_partitioning(), 2); + + let opt = ParquetSortness; + let test = 
OptimizationTest::new(plan, opt); + insta::assert_yaml_snapshot!( + test, + @r###" + --- + input: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={2 groups: [[1.parquet, 2.parquet], [3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + output: + Ok: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={3 groups: [[1.parquet], [2.parquet], [3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + "### + ); + + assert_unknown_partitioning(test.output_plan().unwrap().output_partitioning(), 3); + } + + #[test] + fn test_parquet_already_flat() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1)], vec![file(2)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col2", "col1"], &schema)], + }; + let inner = ParquetExec::new(base_config, None, None); + let plan = Arc::new( + SortExec::new(ordering(["col2", "col1"], &schema), Arc::new(inner)) + .with_fetch(Some(42)), + ); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + output: + Ok: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + "### + ); + } + + #[test] + fn test_parquet_has_different_ordering() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: 
ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col1", "col2"], &schema)], + }; + let inner = ParquetExec::new(base_config, None, None); + let plan = Arc::new( + SortExec::new(ordering(["col2", "col1"], &schema), Arc::new(inner)) + .with_fetch(Some(42)), + ); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" + output: + Ok: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" + "### + ); + } + + #[test] + fn test_parquet_has_no_ordering() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![], + }; + let inner = ParquetExec::new(base_config, None, None); + let plan = Arc::new( + SortExec::new(ordering(["col2", "col1"], &schema), Arc::new(inner)) + .with_fetch(Some(42)), + ); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3]" + output: + Ok: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " 
ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3]" + "### + ); + } + + #[test] + fn test_fanout_limit() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2), file(3)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col2", "col1"], &schema)], + }; + let inner = ParquetExec::new(base_config, None, None); + let plan = Arc::new( + SortExec::new(ordering(["col2", "col1"], &schema), Arc::new(inner)) + .with_fetch(Some(42)), + ); + let opt = ParquetSortness; + let mut config = ConfigOptions::default(); + config.extensions.insert(IoxConfigExt { + max_parquet_fanout: 2, + ..Default::default() + }); + insta::assert_yaml_snapshot!( + OptimizationTest::new_with_config(plan, opt, &config), + @r###" + --- + input: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet, 3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + output: + Ok: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet, 3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + "### + ); + } + + #[test] + fn test_other_node() { + let schema = schema(); + let inner = PlaceholderRowExec::new(Arc::clone(&schema)); + let plan = Arc::new( + SortExec::new(ordering(["col2", "col1"], &schema), Arc::new(inner)) + .with_fetch(Some(42)), + ); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " PlaceholderRowExec" + output: + Ok: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" 
+ - " PlaceholderRowExec" + "### + ); + } + + #[test] + fn test_does_not_touch_freestanding_parquet_exec() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col2", "col1"], &schema)], + }; + let plan = Arc::new(ParquetExec::new(base_config, None, None)); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + output: + Ok: + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" + "### + ); + } + + #[test] + fn test_ignore_outer_sort_if_inner_preform_resort() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col1", "col2"], &schema)], + }; + let plan = Arc::new(ParquetExec::new(base_config, None, None)); + let plan = + Arc::new(SortExec::new(ordering(["col2", "col1"], &schema), plan).with_fetch(Some(42))); + let plan = + Arc::new(SortExec::new(ordering(["col1", "col2"], &schema), plan).with_fetch(Some(42))); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: 
file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" + output: + Ok: + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" + "### + ); + } + + #[test] + fn test_honor_inner_sort_even_if_outer_preform_resort() { + let schema = schema(); + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col1", "col2"], &schema)], + }; + let plan = Arc::new(ParquetExec::new(base_config, None, None)); + let plan = + Arc::new(SortExec::new(ordering(["col1", "col2"], &schema), plan).with_fetch(Some(42))); + let plan = + Arc::new(SortExec::new(ordering(["col2", "col1"], &schema), plan).with_fetch(Some(42))); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" + output: + Ok: + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" + "### + ); + } + + #[test] + fn test_issue_idpe_17556() { + let schema = schema_with_chunk_order(); + + let base_config = FileScanConfig { + object_store_url: 
ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(&schema), + file_groups: vec![vec![file(1), file(2)]], + statistics: Statistics::new_unknown(&schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![ordering(["col2", "col1", CHUNK_ORDER_COLUMN_NAME], &schema)], + }; + let plan_parquet = Arc::new(ParquetExec::new(base_config, None, None)); + let plan_batches = Arc::new(RecordBatchesExec::new(vec![], Arc::clone(&schema), None)); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet])); + let plan = Arc::new(DeduplicateExec::new( + plan, + ordering(["col2", "col1"], &schema), + true, + )); + let opt = ParquetSortness; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=0, projection=[col1, col2, col3, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, __chunk_order@3 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=0, projection=[col1, col2, col3, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, __chunk_order@3 ASC]" + "### + ); + } + + fn schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("col1", DataType::Utf8, true), + Field::new("col2", DataType::Utf8, true), + Field::new("col3", DataType::Utf8, true), + ])) + } + + fn schema_with_chunk_order() -> SchemaRef { + Arc::new(Schema::new( + schema() + .fields() + .iter() + .cloned() + .chain(std::iter::once(chunk_order_field())) + .collect::(), + )) + } + + fn file(n: u128) -> PartitionedFile { + PartitionedFile { + object_meta: ObjectMeta { + location: 
Path::parse(format!("{n}.parquet")).unwrap(), + last_modified: Default::default(), + size: 0, + e_tag: None, + version: None, + }, + partition_values: vec![], + range: None, + extensions: None, + } + } + + fn ordering(cols: [&str; N], schema: &SchemaRef) -> Vec { + cols.into_iter() + .map(|col| PhysicalSortExpr { + expr: Arc::new(Column::new_with_schema(col, schema.as_ref()).unwrap()), + options: Default::default(), + }) + .collect() + } +} diff --git a/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs b/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs new file mode 100644 index 0000000..f76772a --- /dev/null +++ b/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs @@ -0,0 +1,706 @@ +use std::sync::Arc; + +use datafusion::{ + common::{ + internal_err, + tree_node::{RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter}, + }, + config::ConfigOptions, + error::{DataFusionError, Result}, + physical_expr::{PhysicalSortExpr, PhysicalSortRequirement}, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{ + repartition::RepartitionExec, sorts::sort::SortExec, union::UnionExec, ExecutionPlan, + }, +}; + +/// Pushes a [`SortExec`] through a [`UnionExec`], possibly +/// including multiple [`RepartitionExec`] nodes (converting them +/// to be sort-preserving in the process), provided that at least +/// one of the children of the union is already sorted. 
+/// +/// In other words, a typical plan like this +/// ```text +/// DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC] +/// SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC] +/// RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8 +/// RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4 +/// UnionExec +/// RecordBatchesExec: batches_groups=2 batches=0 total_rows=0 +/// ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC] +/// ``` +/// will become: +/// ```text +/// DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC] +/// SortPreservingRepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8 +/// SortPreservingRepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4 +/// UnionExec +/// SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC] +/// RecordBatchesExec: batches_groups=2 batches=0 total_rows=0 +/// ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC] +/// ``` +/// +/// There is a tension between: +/// - Wanting to do sorts in parallel +/// - Sorting fewer rows +/// +/// DataFusion will not push down a sort through a `RepartitionExec` +/// because it could reduce the parallelism of the sort. However, +/// in IOx, unsorted children of `UnionExec` will tend to be +/// [`RecordBatchesExec`] which is likely to have many fewer rows than +/// other children which will tend to be [`ParquetExec`]. +/// So making this transformation will generally have a dramatic effect +/// on the amount of data being sorted. 
+/// +/// [`RecordBatchesExec`]: crate::provider::RecordBatchesExec +/// [`ParquetExec`]: datafusion::datasource::physical_plan::ParquetExec +pub(crate) struct PushSortThroughUnion; + +impl PhysicalOptimizerRule for PushSortThroughUnion { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + let Some(sort_exec) = plan.as_any().downcast_ref::() else { + return Ok(Transformed::No(plan)); + }; + + if !sort_should_be_pushed_down(sort_exec)? { + return Ok(Transformed::No(plan)); + } + + let mut plan = Arc::clone(sort_exec.input()); + let mut rewriter = SortRewriter { + ordering: sort_exec.output_ordering().unwrap().to_vec(), + }; + + plan = plan.rewrite(&mut rewriter)?; + + // As a sanity check, make sure plan has the same ordering as before. + // If this fails, there is a bug in this optimization. + let Some(required_order) = sort_exec.output_ordering().map(sort_exprs_to_requirement) + else { + return internal_err!("No sort order after a sort"); + }; + + if !plan + .equivalence_properties() + .ordering_satisfy_requirement(&required_order) + { + return internal_err!("PushSortThroughUnion corrupted plan sort order"); + } + + Ok(Transformed::Yes(plan)) + }) + } + + fn name(&self) -> &str { + "push_sort_through_union" + } + + fn schema_check(&self) -> bool { + true + } +} + +/// Returns true if the [`SortExec`] can be pushed down beneath a [`UnionExec`]. 
+fn sort_should_be_pushed_down(sort_exec: &SortExec) -> Result { + // Skip over any RepartitionExecs + let mut input = sort_exec.input(); + while input.as_any().is::() { + input = input + .as_any() + .downcast_ref::() + .expect("this must be a RepartitionExec") + .input(); + } + + let Some(union_exec) = input.as_any().downcast_ref::() else { + return Ok(false); + }; + + let Some(required_order) = sort_exec.output_ordering().map(sort_exprs_to_requirement) else { + return internal_err!("No sort order after a sort"); + }; + + // Push down the sort if any of the children are already sorted. + // This means we will need to sort fewer rows than if we didn't + // push down the sort. + Ok(union_exec.children().iter().any(|child| { + child + .equivalence_properties() + .ordering_satisfy_requirement(&required_order) + })) +} + +/// Rewrites a plan: +/// - Any [`RepartitionExec`] nodes are converted to be sort-preserving +/// - Any children of a [`UnionExec`] that are not sorted get a [`SortExec`] +/// added to them. +/// - Any other nodes will stop the rewrite. +struct SortRewriter { + ordering: Vec, +} + +impl TreeNodeRewriter for SortRewriter { + type N = Arc; + + fn pre_visit(&mut self, plan: &Self::N) -> Result { + if plan.as_any().is::() { + Ok(datafusion::common::tree_node::RewriteRecursion::Continue) + } else if plan.as_any().is::() { + Ok(datafusion::common::tree_node::RewriteRecursion::Mutate) + } else { + Ok(datafusion::common::tree_node::RewriteRecursion::Stop) + } + } + + fn mutate(&mut self, plan: Self::N) -> Result { + if let Some(repartition_exec) = plan.as_any().downcast_ref::() { + // Convert any RepartitionExec to be sort-preserving + Ok(Arc::new( + RepartitionExec::try_new( + Arc::clone(repartition_exec.input()), + repartition_exec.output_partitioning(), + )? + .with_preserve_order(), + )) + } else if let Some(union_exec) = plan.as_any().downcast_ref::() { + // Any children of the UnionExec that are not already sorted, + // need to be sorted. 
+ let required_ordering = sort_exprs_to_requirement(self.ordering.as_ref()); + + let new_children = union_exec + .children() + .into_iter() + .map(|child| { + if !child + .equivalence_properties() + .ordering_satisfy_requirement(&required_ordering) + { + let sort_exec = SortExec::new(self.ordering.clone(), child) + .with_preserve_partitioning(true); + Arc::new(sort_exec) + } else { + child + } + }) + .collect(); + + Ok(Arc::new(UnionExec::new(new_children))) + } else { + Ok(plan) + } + } +} + +fn sort_exprs_to_requirement(sort_exprs: &[PhysicalSortExpr]) -> Vec { + sort_exprs + .iter() + .map(|sort_expr| sort_expr.clone().into()) + .collect() +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow::datatypes::SchemaRef; + use datafusion::{ + datasource::{ + listing::PartitionedFile, + object_store::ObjectStoreUrl, + physical_plan::{FileScanConfig, ParquetExec}, + }, + physical_expr::PhysicalSortExpr, + physical_plan::{ + coalesce_batches::CoalesceBatchesExec, expressions::Column, + repartition::RepartitionExec, sorts::sort::SortExec, union::UnionExec, ExecutionPlan, + Partitioning, Statistics, + }, + }; + use object_store::{path::Path, ObjectMeta}; + use schema::{InfluxFieldType, SchemaBuilder as IOxSchemaBuilder}; + + use crate::{ + physical_optimizer::{ + sort::push_sort_through_union::PushSortThroughUnion, test_util::OptimizationTest, + }, + provider::{DeduplicateExec, RecordBatchesExec}, + test::TestChunk, + CHUNK_ORDER_COLUMN_NAME, + }; + + #[test] + fn test_push_sort_through_union() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering(["col2", "col1", "time", CHUNK_ORDER_COLUMN_NAME], &schema); + + let plan_parquet = parquet_exec(&schema, &order); + let plan_batches = record_batches_exec(2); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| 
e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + let opt = PushSortThroughUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_push_sort_through_union_top_level_sort() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + 
let order = ordering(["col2", "col1", "time", CHUNK_ORDER_COLUMN_NAME], &schema); + + let plan_parquet = parquet_exec(&schema, &order); + let plan_batches = record_batches_exec(2); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + let output_order = ordering(["time"], &schema); + let plan = Arc::new(SortExec::new(output_order, plan)); + + // Nothing is done with the SortExec at the top level, because + // it does not match the pattern. + let opt = PushSortThroughUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " SortExec: expr=[time@3 ASC]" + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + output: + Ok: + - " SortExec: expr=[time@3 ASC]" + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " 
RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_push_sort_through_union_no_repartition() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering(["col2", "col1", "time", CHUNK_ORDER_COLUMN_NAME], &schema); + + let plan_parquet = parquet_exec(&schema, &order); + let plan_batches = record_batches_exec(2); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet])); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + // RepartitionExec does not need to be present for the optimization to apply + // (Although DF *will* handle this case) + let opt = PushSortThroughUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RecordBatchesExec: 
chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_no_sorted_children() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering(["col2", "col1", "time", CHUNK_ORDER_COLUMN_NAME], &schema); + + let plan_batches_1 = record_batches_exec(2); + let plan_batches_2 = record_batches_exec(2); + + let plan = Arc::new(UnionExec::new(vec![plan_batches_1, plan_batches_2])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + // No children of the union are sorted, so the sort will not be pushed down. 
+ let opt = PushSortThroughUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + } + + #[test] + fn test_all_sorted_children() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering(["col2", "col1", "time", CHUNK_ORDER_COLUMN_NAME], &schema); + + let plan_parquet_1 = parquet_exec(&schema, &order); + let plan_parquet_2 = parquet_exec(&schema, &order); + + let plan = Arc::new(UnionExec::new(vec![plan_parquet_1, plan_parquet_2])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, 
true)); + + // All children of the union are sorted, so RepartitionExec nodes are converted to + // be sort-preserving. + let opt = PushSortThroughUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_no_union() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order 
= ordering(["col2", "col1", "time", CHUNK_ORDER_COLUMN_NAME], &schema); + + let plan = parquet_exec(&schema, &order); + + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + // There is no union in the plan, so the pattern does not match. + let opt = PushSortThroughUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=2" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=2" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_two_sorts() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering(["col2", "col1", "time", CHUNK_ORDER_COLUMN_NAME], 
&schema); + + let plan_parquet = parquet_exec(&schema, &order); + let plan_batches = record_batches_exec(2); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + // With two identical sorts in the plan, both of them will be removed, + // because the transformation is applied bottom-up. + let opt = PushSortThroughUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, 
sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_extra_node() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering(["col2", "col1", "time", CHUNK_ORDER_COLUMN_NAME], &schema); + + let plan_parquet = parquet_exec(&schema, &order); + let plan_batches = record_batches_exec(2); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(CoalesceBatchesExec::new(plan, 4096)); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + // Extra nodes in the plan, like CoalesceBatchesExec, will break the pattern matching + // and prevent the transformation from occurring. 
+ let opt = PushSortThroughUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " CoalesceBatchesExec: target_batch_size=4096" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " CoalesceBatchesExec: target_batch_size=4096" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_wrong_order() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering(["col2", "col1", "time", CHUNK_ORDER_COLUMN_NAME], &schema); + + let wrong_order = ordering(["col1", "col2", "time", CHUNK_ORDER_COLUMN_NAME], &schema); + let plan_parquet = parquet_exec(&schema, &wrong_order); + let plan_batches = record_batches_exec(2); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, 
plan_parquet])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + // The ParquetExec has the wrong output order so no children of the union have the right + // sort order. Therefore the optimization is not applied. + let opt = PushSortThroughUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col1@0 ASC, col2@1 ASC, time@3 ASC, __chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col1@0 ASC, col2@1 ASC, time@3 ASC, __chunk_order@4 ASC]" + 
"### + ); + } + + fn record_batches_exec(n_chunks: usize) -> Arc { + let chunks = std::iter::repeat(Arc::new(TestChunk::new("t")) as _) + .take(n_chunks) + .collect::>(); + Arc::new(RecordBatchesExec::new(chunks, schema(), None)) + } + + fn parquet_exec(schema: &SchemaRef, order: &[PhysicalSortExpr]) -> Arc { + let base_config = FileScanConfig { + object_store_url: ObjectStoreUrl::parse("test://").unwrap(), + file_schema: Arc::clone(schema), + file_groups: vec![vec![file(1)], vec![file(2)]], + statistics: Statistics::new_unknown(schema), + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![order.to_vec()], + }; + Arc::new(ParquetExec::new(base_config, None, None)) + } + + fn schema() -> SchemaRef { + IOxSchemaBuilder::new() + .tag("col1") + .tag("col2") + .influx_field("field1", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into() + } + + fn file(n: u128) -> PartitionedFile { + PartitionedFile { + object_meta: ObjectMeta { + location: Path::parse(format!("{n}.parquet")).unwrap(), + last_modified: Default::default(), + size: 0, + e_tag: None, + version: None, + }, + partition_values: vec![], + range: None, + extensions: None, + } + } + + fn ordering(cols: [&str; N], schema: &SchemaRef) -> Vec { + cols.into_iter() + .map(|col| PhysicalSortExpr { + expr: Arc::new(Column::new_with_schema(col, schema.as_ref()).unwrap()), + options: Default::default(), + }) + .collect() + } +} diff --git a/iox_query/src/physical_optimizer/sort/util.rs b/iox_query/src/physical_optimizer/sort/util.rs new file mode 100644 index 0000000..274b016 --- /dev/null +++ b/iox_query/src/physical_optimizer/sort/util.rs @@ -0,0 +1,102 @@ +use std::sync::Arc; + +use crate::statistics::{column_statistics_min_max, compute_stats_column_min_max, overlap}; +use arrow::compute::{rank, SortOptions}; +use datafusion::{error::Result, physical_plan::ExecutionPlan, scalar::ScalarValue}; +use 
observability_deps::tracing::trace; + +/// Compute statistics for the given plans on a given column name +/// Return none if the statistics are not available +pub(crate) fn collect_statistics_min_max( + plans: &[Arc], + col_name: &str, +) -> Result>> { + // temp solution while waiting for DF's statistics to get mature + // Compute min max stats for all inputs of UnionExec on the sorted column + // https://github.com/apache/arrow-datafusion/issues/8078 + let col_stats = plans + .iter() + .map(|plan| compute_stats_column_min_max(&**plan, col_name)) + .collect::>>()?; + + // If min and max not available, return none + let mut value_ranges = Vec::with_capacity(col_stats.len()); + for stats in &col_stats { + let Some((min, max)) = column_statistics_min_max(stats) else { + trace!("-------- min_max not available"); + return Ok(None); + }; + + value_ranges.push((min, max)); + } + + // todo: use this when DF satistics is ready + // // Get statistics for the inputs of UnionExec on the sorted column + // let Some(value_ranges) = statistics_min_max(plans, col_name) + // else { + // return Ok(None); + // }; + + Ok(Some(value_ranges)) +} + +/// Plans and their corresponding value ranges +pub(crate) struct PlansValueRanges { + pub plans: Vec>, + // Min and max values of the plan on a specific column + pub value_ranges: Vec<(ScalarValue, ScalarValue)>, +} + +/// Sort the given plans by value ranges +/// Return none if +/// . the number of plans is not the same as the number of value ranges +/// . the value ranges overlap +pub(crate) fn sort_by_value_ranges( + plans: Vec>, + value_ranges: Vec<(ScalarValue, ScalarValue)>, + sort_options: SortOptions, +) -> Result> { + if plans.len() != value_ranges.len() { + trace!( + plans.len = plans.len(), + value_ranges.len = value_ranges.len(), + "--------- number of plans is not the same as the number of value ranges" + ); + return Ok(None); + } + + if overlap(&value_ranges)? 
{ + trace!("--------- value ranges overlap"); + return Ok(None); + } + + // get the min value of each value range + let min_iter = value_ranges.iter().map(|(min, _)| min.clone()); + let mins = ScalarValue::iter_to_array(min_iter)?; + + // rank the min values + let ranks = rank(&*mins, Some(sort_options))?; + + // sort the plans by the ranks of their min values + let mut plan_rank_zip: Vec<(Arc, u32)> = + plans.into_iter().zip(ranks.clone()).collect::>(); + plan_rank_zip.sort_by(|(_, min1), (_, min2)| min1.cmp(min2)); + let plans = plan_rank_zip + .into_iter() + .map(|(plan, _)| plan) + .collect::>(); + + // Sort the value ranges by the ranks of their min values + let mut value_range_rank_zip: Vec<((ScalarValue, ScalarValue), u32)> = + value_ranges.into_iter().zip(ranks).collect::>(); + value_range_rank_zip.sort_by(|(_, min1), (_, min2)| min1.cmp(min2)); + let value_ranges = value_range_rank_zip + .into_iter() + .map(|(value_range, _)| value_range) + .collect::>(); + + Ok(Some(PlansValueRanges { + plans, + value_ranges, + })) +} diff --git a/iox_query/src/physical_optimizer/test_util.rs b/iox_query/src/physical_optimizer/test_util.rs new file mode 100644 index 0000000..d02c21a --- /dev/null +++ b/iox_query/src/physical_optimizer/test_util.rs @@ -0,0 +1,87 @@ +use std::sync::Arc; + +use datafusion::{ + config::ConfigOptions, + error::DataFusionError, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{ExecutionPlan, Partitioning}, +}; +use serde::Serialize; + +use crate::test::format_execution_plan; + +#[derive(Debug, Serialize)] +pub struct OptimizationTest { + input: Vec, + output: Result, String>, + + #[serde(skip_serializing)] + output_plan: Option>, +} + +impl OptimizationTest { + pub fn new(input_plan: Arc, opt: O) -> Self + where + O: PhysicalOptimizerRule, + { + Self::new_with_config(input_plan, opt, &ConfigOptions::default()) + } + + pub fn new_with_config( + input_plan: Arc, + opt: O, + config: &ConfigOptions, + ) -> Self + where + O: 
PhysicalOptimizerRule, + { + let input = format_execution_plan(&input_plan); + + let input_schema = input_plan.schema(); + + let output_result = opt.optimize(input_plan, config); + let output_plan = output_result.as_ref().ok().cloned(); + let output = output_result + .and_then(|plan| { + if opt.schema_check() && (plan.schema() != input_schema) { + Err(DataFusionError::External( + format!( + "Schema mismatch:\n\nBefore:\n{:?}\n\nAfter:\n{:?}", + input_schema, + plan.schema() + ) + .into(), + )) + } else { + Ok(plan) + } + }) + .map(|plan| format_execution_plan(&plan)) + .map_err(|e| e.to_string()); + + Self { + input, + output, + output_plan, + } + } + + pub fn output_plan(&self) -> Option<&Arc> { + self.output_plan.as_ref() + } +} + +/// Check if given partitioning is [`Partitioning::UnknownPartitioning`] with the given count. +/// +/// This is needed because [`PartialEq`] for [`Partitioning`] is specified as "unknown != unknown". +#[track_caller] +pub fn assert_unknown_partitioning(partitioning: Partitioning, n: usize) { + match partitioning { + Partitioning::UnknownPartitioning(n2) if n == n2 => {} + _ => panic!( + "Unexpected partitioning, wanted: {:?}, got: {:?}", + Partitioning::UnknownPartitioning(n), + partitioning + ), + } +} diff --git a/iox_query/src/physical_optimizer/tests.rs b/iox_query/src/physical_optimizer/tests.rs new file mode 100644 index 0000000..4e58227 --- /dev/null +++ b/iox_query/src/physical_optimizer/tests.rs @@ -0,0 +1,210 @@ +//! Optimizer edge cases. +//! +//! These are NOT part of the usual end2end query tests because they depend on very specific chunk arrangements that are +//! hard to reproduce in an end2end setting. 
+ +use std::sync::Arc; + +use arrow::datatypes::DataType; +use datafusion::{ + common::DFSchema, + datasource::provider_as_source, + logical_expr::{col, count, lit, Expr, ExprSchemable, LogicalPlanBuilder}, + scalar::ScalarValue, +}; +use schema::sort::SortKey; +use test_helpers::maybe_start_logging; + +use crate::{ + exec::{DedicatedExecutors, Executor, ExecutorConfig, ExecutorType}, + provider::ProviderBuilder, + test::{format_execution_plan, TestChunk}, + QueryChunk, +}; + +/// Test that reconstructs specific case where parquet files may unnecessarily be sorted. +/// +/// See: +/// - +/// - +#[tokio::test] +async fn test_parquet_should_not_be_resorted() { + // DF session setup + let config = ExecutorConfig { + target_query_partitions: 16.try_into().unwrap(), + ..ExecutorConfig::testing() + }; + let exec = Executor::new_with_config_and_executors( + config, + Arc::new(DedicatedExecutors::new_testing()), + ); + let ctx = exec.new_context(ExecutorType::Query); + let state = ctx.inner().state(); + + // chunks + let c = TestChunk::new("t") + .with_tag_column("tag") + .with_time_column_with_full_stats(Some(0), Some(10), 10_000, None); + let c_mem = c.clone().with_may_contain_pk_duplicates(true); + let c_file = c + .clone() + .with_dummy_parquet_file() + .with_may_contain_pk_duplicates(false) + .with_sort_key(SortKey::from_columns([Arc::from("tag"), Arc::from("time")])); + let schema = c.schema().clone(); + let provider = ProviderBuilder::new("t".into(), schema) + .add_chunk(Arc::new(c_mem.clone().with_id(1).with_order(i64::MAX))) + .add_chunk(Arc::new(c_file.clone().with_id(2).with_order(2))) + .add_chunk(Arc::new(c_file.clone().with_id(3).with_order(3))) + .build() + .unwrap(); + + // initial plan + // NOTE: we NEED two time predicates for the bug to trigger! 
+ let expr = col("time") + .gt(lit(ScalarValue::TimestampNanosecond(Some(0), None))) + .and(col("time").gt(lit(ScalarValue::TimestampNanosecond(Some(2), None)))); + + let plan = + LogicalPlanBuilder::scan("t".to_owned(), provider_as_source(Arc::new(provider)), None) + .unwrap() + .filter(expr) + .unwrap() + .aggregate( + std::iter::empty::(), + [count(lit(true)).alias("count")], + ) + .unwrap() + .project([col("count")]) + .unwrap() + .build() + .unwrap(); + + let plan = state.create_physical_plan(&plan).await.unwrap(); + + // The output of the parquet files should not be resorted + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " AggregateExec: mode=Final, gby=[], aggr=[count]" + - " CoalescePartitionsExec" + - " AggregateExec: mode=Partial, gby=[], aggr=[count]" + - " RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1" + - " ProjectionExec: expr=[]" + - " DeduplicateExec: [tag@1 ASC,time@2 ASC]" + - " SortPreservingMergeExec: [tag@1 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " UnionExec" + - " SortExec: expr=[tag@1 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: time@2 > 0 AND time@2 > 2" + - " RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, tag, time]" + - " SortExec: expr=[tag@1 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: time@2 > 0 AND time@2 > 2" + - " RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=2" + - " ParquetExec: file_groups={2 groups: [[2.parquet], [3.parquet]]}, projection=[__chunk_order, tag, time], output_ordering=[tag@1 ASC, time@2 ASC, __chunk_order@0 ASC], predicate=time@1 > 0 AND time@1 > 2, pruning_predicate=time_max@0 > 0 AND time_max@0 > 2" + "### + ); +} + +/// Bug reproducer for: +/// - +/// - +#[tokio::test] +async fn test_parquet_must_resorted() { + 
maybe_start_logging(); + + // DF session setup + let config = ExecutorConfig { + target_query_partitions: 6.try_into().unwrap(), + ..ExecutorConfig::testing() + }; + let exec = Executor::new_with_config_and_executors( + config, + Arc::new(DedicatedExecutors::new_testing()), + ); + let ctx = exec.new_context(ExecutorType::Query); + let state = ctx.inner().state(); + + // chunks + let c = TestChunk::new("t") + .with_tag_column("tag") + .with_f64_field_column("field") + .with_time_column_with_full_stats(Some(0), Some(10), 10_000, None) + .with_may_contain_pk_duplicates(false) + .with_sort_key(SortKey::from_columns([Arc::from("tag"), Arc::from("time")])); + let schema = c.schema().clone(); + let df_schema = DFSchema::try_from(schema.as_arrow().as_ref().clone()).unwrap(); + let provider = ProviderBuilder::new("t".into(), schema) + // need a small file followed by a big one + .add_chunk(Arc::new( + c.clone() + .with_id(1) + .with_order(1) + .with_dummy_parquet_file_and_size(1), + )) + .add_chunk(Arc::new( + c.clone() + .with_id(2) + .with_order(2) + .with_dummy_parquet_file_and_size(100_000_000), + )) + .build() + .unwrap(); + + // initial plan + let expr = col("tag") + .gt(lit("foo")) + .and(col("time").gt(lit(ScalarValue::TimestampNanosecond(Some(2), None)))) + .and( + col("field") + .cast_to(&DataType::Utf8, &df_schema) + .unwrap() + .not_eq(lit("")), + ); + + let plan = + LogicalPlanBuilder::scan("t".to_owned(), provider_as_source(Arc::new(provider)), None) + .unwrap() + .filter(expr) + .unwrap() + .project([col("tag")]) + .unwrap() + .build() + .unwrap(); + + let plan = state.create_physical_plan(&plan).await.unwrap(); + + // The output of the parquet files must be sorted prior to merging + // if the first file_group has more than one file + // + // Prior to https://github.com/influxdata/influxdb_iox/issues/9450, the plan + // called for the ParquetExec to read the files in parallel (using subranges) like: + // ``` + // {6 groups: [[1.parquet:0..1, 
2.parquet:0..16666666], [2.parquet:16666666..33333333],... + // ``` + // + // Groups with more than one file produce an output partition that is the + // result of concatenating them together, so even if the output of each + // individual file is sorted, the output of the partition is not, due to the + // concatenation. + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[tag@1 as tag]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: CAST(field@0 AS Utf8) != " + - " RepartitionExec: partitioning=RoundRobinBatch(6), input_partitions=1" + - " ProjectionExec: expr=[field@1 as field, tag@3 as tag]" + - " DeduplicateExec: [tag@3 ASC,time@2 ASC]" + - " SortPreservingMergeExec: [tag@3 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: tag@3 > foo AND time@2 > 2" + - " RepartitionExec: partitioning=RoundRobinBatch(6), input_partitions=2, preserve_order=true, sort_exprs=tag@3 ASC,time@2 ASC,__chunk_order@0 ASC" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[__chunk_order, field, time, tag], output_ordering=[tag@3 ASC, time@2 ASC, __chunk_order@0 ASC], predicate=tag@1 > foo AND time@2 > 2, pruning_predicate=tag_max@0 > foo AND time_max@1 > 2" + "### + ); +} diff --git a/iox_query/src/physical_optimizer/union/mod.rs b/iox_query/src/physical_optimizer/union/mod.rs new file mode 100644 index 0000000..df595eb --- /dev/null +++ b/iox_query/src/physical_optimizer/union/mod.rs @@ -0,0 +1,6 @@ +//! Rules specific to [`UnionExec`]. +//! +//! 
[`UnionExec`]: datafusion::physical_plan::union::UnionExec + +pub mod nested_union; +pub mod one_union; diff --git a/iox_query/src/physical_optimizer/union/nested_union.rs b/iox_query/src/physical_optimizer/union/nested_union.rs new file mode 100644 index 0000000..7a05139 --- /dev/null +++ b/iox_query/src/physical_optimizer/union/nested_union.rs @@ -0,0 +1,189 @@ +use std::sync::Arc; + +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{union::UnionExec, ExecutionPlan}, +}; + +/// Optimizer that replaces nested [`UnionExec`]s with a single level. +/// +/// # Example +/// ```yaml +/// --- +/// UnionExec: +/// - UnionExec: +/// - SomeExec1 +/// - SomeExec2 +/// - SomeExec3 +/// +/// --- +/// UnionExec: +/// - SomeExec1 +/// - SomeExec2 +/// - SomeExec3 +/// ``` +#[derive(Debug, Default)] +pub struct NestedUnion; + +impl PhysicalOptimizerRule for NestedUnion { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + let plan_any = plan.as_any(); + + if let Some(union_exec) = plan_any.downcast_ref::() { + let children = union_exec.children(); + + let mut children_new = Vec::with_capacity(children.len()); + let mut found_union = false; + for child in children { + if let Some(union_child) = child.as_any().downcast_ref::() { + found_union = true; + children_new.append(&mut union_child.children()); + } else { + children_new.push(child) + } + } + + if found_union { + return Ok(Transformed::Yes(Arc::new(UnionExec::new(children_new)))); + } + } + + Ok(Transformed::No(plan)) + }) + } + + fn name(&self) -> &str { + "nested_union" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; + use datafusion::physical_plan::empty::EmptyExec; + + use crate::physical_optimizer::test_util::OptimizationTest; + + use super::*; + + 
#[test] + #[should_panic(expected = "index out of bounds")] + fn test_union_empty() { + // empty UnionExecs cannot be created in the first place + UnionExec::new(vec![]); + } + + #[test] + fn test_union_not_nested() { + let plan = Arc::new(UnionExec::new(vec![other_node()])); + let opt = NestedUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " EmptyExec" + output: + Ok: + - " UnionExec" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_union_nested() { + let plan = Arc::new(UnionExec::new(vec![ + Arc::new(UnionExec::new(vec![other_node(), other_node()])), + other_node(), + ])); + let opt = NestedUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " UnionExec" + - " EmptyExec" + - " EmptyExec" + - " EmptyExec" + output: + Ok: + - " UnionExec" + - " EmptyExec" + - " EmptyExec" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_union_deeply_nested() { + let plan = Arc::new(UnionExec::new(vec![ + Arc::new(UnionExec::new(vec![ + other_node(), + Arc::new(UnionExec::new(vec![other_node()])), + ])), + other_node(), + ])); + let opt = NestedUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " UnionExec" + - " EmptyExec" + - " UnionExec" + - " EmptyExec" + - " EmptyExec" + output: + Ok: + - " UnionExec" + - " EmptyExec" + - " EmptyExec" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_other_node() { + let plan = other_node(); + let opt = NestedUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " EmptyExec" + output: + Ok: + - " EmptyExec" + "### + ); + } + + fn other_node() -> Arc { + Arc::new(EmptyExec::new(schema())) + } + + fn schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new("c", DataType::UInt32, false)])) + } +} diff --git a/iox_query/src/physical_optimizer/union/one_union.rs 
b/iox_query/src/physical_optimizer/union/one_union.rs new file mode 100644 index 0000000..15f277a --- /dev/null +++ b/iox_query/src/physical_optimizer/union/one_union.rs @@ -0,0 +1,133 @@ +use std::sync::Arc; + +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{union::UnionExec, ExecutionPlan}, +}; + +/// Optimizer that replaces [`UnionExec`] with a single child node w/ the child note itself. +/// +/// # Example +/// ```yaml +/// --- +/// UnionExec: +/// - SomeExec1 +/// +/// --- +/// SomeExec1 +/// ``` +#[derive(Debug, Default)] +pub struct OneUnion; + +impl PhysicalOptimizerRule for OneUnion { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + plan.transform_up(&|plan| { + let plan_any = plan.as_any(); + + if let Some(union_exec) = plan_any.downcast_ref::() { + let mut children = union_exec.children(); + if children.len() == 1 { + return Ok(Transformed::Yes(children.remove(0))); + } + } + + Ok(Transformed::No(plan)) + }) + } + + fn name(&self) -> &str { + "one_union" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; + use datafusion::physical_plan::empty::EmptyExec; + + use crate::physical_optimizer::test_util::OptimizationTest; + + use super::*; + + #[test] + #[should_panic(expected = "index out of bounds")] + fn test_union_empty() { + // empty UnionExecs cannot be created in the first place + UnionExec::new(vec![]); + } + + #[test] + fn test_union_one() { + let plan = Arc::new(UnionExec::new(vec![other_node()])); + let opt = OneUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " EmptyExec" + output: + Ok: + - " EmptyExec" + "### + ); + } + + #[test] + fn test_union_two() { + let plan = Arc::new(UnionExec::new(vec![other_node(), other_node()])); + let 
opt = OneUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " EmptyExec" + - " EmptyExec" + output: + Ok: + - " UnionExec" + - " EmptyExec" + - " EmptyExec" + "### + ); + } + + #[test] + fn test_other_node() { + let plan = other_node(); + let opt = OneUnion; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " EmptyExec" + output: + Ok: + - " EmptyExec" + "### + ); + } + + fn other_node() -> Arc { + Arc::new(EmptyExec::new(schema())) + } + + fn schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new("c", DataType::UInt32, false)])) + } +} diff --git a/iox_query/src/plan.rs b/iox_query/src/plan.rs new file mode 100644 index 0000000..693ff90 --- /dev/null +++ b/iox_query/src/plan.rs @@ -0,0 +1,3 @@ +pub mod fieldlist; +pub mod seriesset; +pub mod stringset; diff --git a/iox_query/src/plan/fieldlist.rs b/iox_query/src/plan/fieldlist.rs new file mode 100644 index 0000000..e2e19f7 --- /dev/null +++ b/iox_query/src/plan/fieldlist.rs @@ -0,0 +1,57 @@ +use datafusion::logical_expr::LogicalPlan; + +use crate::exec::fieldlist::Field; +use std::collections::BTreeMap; + +pub type FieldSet = BTreeMap; + +/// A plan which produces a logical set of Fields (e.g. InfluxDB +/// Fields with name, and data type, and last_timestamp). +/// +/// known_values has a set of pre-computed values to be merged with +/// the extra_plans. +#[derive(Debug, Default)] +pub struct FieldListPlan { + /// Known values + pub known_values: FieldSet, + /// General plans + pub extra_plans: Vec, +} + +impl From> for FieldListPlan { + /// Create FieldList plan from a DataFusion LogicalPlan node, each + /// of which must produce fields in the correct format. The output + /// of each plan will be included into the final set. 
+ fn from(plans: Vec) -> Self { + Self { + known_values: FieldSet::new(), + extra_plans: plans, + } + } +} + +impl From for FieldListPlan { + /// Create a StringSet plan from a single DataFusion LogicalPlan + /// node, which must produce fields in the correct format + fn from(plan: LogicalPlan) -> Self { + Self::from(vec![plan]) + } +} + +impl FieldListPlan { + pub fn new() -> Self { + Self::default() + } + + /// Append the other plan to ourselves + pub fn append_other(mut self, other: Self) -> Self { + self.extra_plans.extend(other.extra_plans); + self.known_values.extend(other.known_values); + self + } + + /// Append a single field to the known set of fields in this builder + pub fn append_field(&mut self, s: Field) { + self.known_values.insert(s.name.clone(), s); + } +} diff --git a/iox_query/src/plan/seriesset.rs b/iox_query/src/plan/seriesset.rs new file mode 100644 index 0000000..a158438 --- /dev/null +++ b/iox_query/src/plan/seriesset.rs @@ -0,0 +1,108 @@ +use std::sync::Arc; + +use datafusion::logical_expr::LogicalPlan; + +use crate::exec::field::FieldColumns; + +/// A plan that can be run to produce a logical stream of time series, +/// as represented as sequence of SeriesSets from a single DataFusion +/// plan, optionally grouped in some way. +/// +/// TODO: remove the tag/field designations below and attach a +/// `Schema` to the plan (which has the tag and field column +/// information natively) +#[derive(Debug)] +pub struct SeriesSetPlan { + /// The table name this came from + pub table_name: Arc, + + /// Datafusion plan to execute. The plan must produce + /// RecordBatches that have: + /// + /// * fields for each name in `tag_columns` and `field_columns` + /// * a timestamp column called 'time' + /// * each column in tag_columns must be a String (Utf8) + pub plan: LogicalPlan, + + /// The names of the columns that define tags. 
+ /// + /// Note these are `Arc` strings because they are duplicated for + /// *each* resulting `SeriesSet` that is produced when this type + /// of plan is executed. + pub tag_columns: Vec>, + + /// The names of the columns which are "fields" + pub field_columns: FieldColumns, +} + +impl SeriesSetPlan { + /// Create a SeriesSetPlan that will not produce any Group items + pub fn new_from_shared_timestamp( + table_name: Arc, + plan: LogicalPlan, + tag_columns: Vec>, + field_columns: Vec>, + ) -> Self { + Self::new(table_name, plan, tag_columns, field_columns.into()) + } + + /// Create a SeriesSetPlan that will not produce any Group items + pub fn new( + table_name: Arc, + plan: LogicalPlan, + tag_columns: Vec>, + field_columns: FieldColumns, + ) -> Self { + Self { + table_name, + plan, + tag_columns, + field_columns, + } + } +} + +/// A container for plans which each produce a logical stream of +/// timeseries (from across many potential tables). +#[derive(Debug, Default)] +pub struct SeriesSetPlans { + /// Plans the generate Series, ordered by table_name. + /// + /// Each plan produces output that is sorted by tag keys (tag + /// column values) and then time. + pub plans: Vec, + + /// grouping keys, if any, that specify how the output series should be + /// sorted (aka grouped). If empty, means no grouping is needed + /// + /// There are several special values that are possible in `group_keys`: + /// + /// 1. _field (means group by field column name) + /// 2. _measurement (means group by the table name) + /// 3. 
_time (means group by the time column) + pub group_columns: Option>>, +} + +impl SeriesSetPlans { + pub fn into_inner(self) -> Vec { + self.plans + } +} + +impl SeriesSetPlans { + /// Create a new, ungrouped SeriesSetPlans + pub fn new(plans: Vec) -> Self { + Self { + plans, + group_columns: None, + } + } + + /// Group the created SeriesSetPlans + pub fn grouped_by(self, group_columns: Vec>) -> Self { + Self { + group_columns: Some(group_columns), + ..self + } + } +} diff --git a/iox_query/src/plan/stringset.rs b/iox_query/src/plan/stringset.rs new file mode 100644 index 0000000..8e49d2c --- /dev/null +++ b/iox_query/src/plan/stringset.rs @@ -0,0 +1,233 @@ +use std::sync::Arc; + +use arrow_util::util::str_iter_to_batch; +use datafusion::logical_expr::LogicalPlan; + +/// The name of the column containing table names returned by a call to +/// `table_names`. +const TABLE_NAMES_COLUMN_NAME: &str = "table"; + +use crate::{ + exec::stringset::{StringSet, StringSetRef}, + util::make_scan_plan, +}; + +use snafu::{ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Internal error converting to arrow: {}", source))] + InternalConvertingToArrow { source: arrow::error::ArrowError }, + + #[snafu(display("Internal error creating a plan for stringset: {}", source))] + InternalPlanningStringSet { + source: datafusion::error::DataFusionError, + }, +} + +pub type Result = std::result::Result; + +/// A plan which produces a logical set of Strings (e.g. tag +/// values). This includes variants with pre-calculated results as +/// well a variant that runs a full on DataFusion plan. +#[derive(Debug)] +pub enum StringSetPlan { + /// The results are known from metadata only without having to run + /// an actual datafusion plan + Known(StringSetRef), + + /// A DataFusion plan(s) to execute. 
Each plan must produce + /// RecordBatches with exactly one String column, though the + /// values produced by the plan may be repeated + /// + /// TODO: it would be cool to have a single datafusion LogicalPlan + /// that merged all the results together. However, no such Union + /// node exists at the time of writing, so we do the unioning in IOx + Plan(Vec), +} + +impl From for StringSetPlan { + /// Create a StringSetPlan from a StringSetRef + fn from(set: StringSetRef) -> Self { + Self::Known(set) + } +} + +impl From for StringSetPlan { + /// Create a StringSetPlan from a StringSet result, wrapping the error type + /// appropriately + fn from(set: StringSet) -> Self { + Self::Known(StringSetRef::new(set)) + } +} + +impl From> for StringSetPlan { + /// Create StringSet plan from a DataFusion LogicalPlan node, each + /// of which must produce a single output Utf8 column. The output + /// of each plan will be included into the final set. + fn from(plans: Vec) -> Self { + Self::Plan(plans) + } +} + +impl From for StringSetPlan { + /// Create a StringSet plan from a single DataFusion LogicalPlan + /// node which produces a single output Utf8 column. 
+ fn from(plan: LogicalPlan) -> Self { + Self::Plan(vec![plan]) + } +} + +/// Builder for StringSet plans for appending multiple plans together +/// +/// If the values are known beforehand, the builder merges the +/// strings, otherwise it falls back to generic plans +#[derive(Debug, Default)] +pub struct StringSetPlanBuilder { + /// Known strings + strings: StringSet, + /// General plans + plans: Vec, +} + +impl StringSetPlanBuilder { + pub fn new() -> Self { + Self::default() + } + + /// Append the strings from the passed plan into ourselves if possible, or + /// passes on the plan + pub fn append_other(mut self, other: StringSetPlan) -> Self { + match other { + StringSetPlan::Known(ssref) => match Arc::try_unwrap(ssref) { + Ok(mut ss) => { + self.strings.append(&mut ss); + } + Err(ssref) => { + for s in &*ssref { + if !self.strings.contains(s) { + self.strings.insert(s.clone()); + } + } + } + }, + StringSetPlan::Plan(mut other_plans) => self.plans.append(&mut other_plans), + } + + self + } + + /// Return true if we know already that `s` is contained in the + /// StringSet. Note that if `contains()` returns false, `s` may be + /// in the stringset after execution. + pub fn contains(&self, s: impl AsRef) -> bool { + self.strings.contains(s.as_ref()) + } + + /// Append a single string to the known set of strings in this builder + pub fn append_string(&mut self, s: impl Into) { + self.strings.insert(s.into()); + } + + /// returns an iterator over the currently known strings in this builder + pub fn known_strings_iter(&self) -> impl Iterator { + self.strings.iter() + } + + /// Create a StringSetPlan that produces the deduplicated (union) + /// of all plans `append`ed to this builder. 
+ pub fn build(self) -> Result { + let Self { strings, mut plans } = self; + + if plans.is_empty() { + // only a known set of strings + Ok(StringSetPlan::Known(Arc::new(strings))) + } else { + // Had at least one general plan, so need to use general + // purpose plan for the known strings + if !strings.is_empty() { + let batch = + str_iter_to_batch(TABLE_NAMES_COLUMN_NAME, strings.into_iter().map(Some)) + .context(InternalConvertingToArrowSnafu)?; + + let plan = make_scan_plan(batch).context(InternalPlanningStringSetSnafu)?; + + plans.push(plan) + } + + Ok(StringSetPlan::Plan(plans)) + } + } +} + +#[cfg(test)] +mod tests { + use crate::exec::{Executor, ExecutorType}; + + use super::*; + + #[test] + fn test_builder_empty() { + let plan = StringSetPlanBuilder::new().build().unwrap(); + let empty_ss = StringSet::new().into(); + if let StringSetPlan::Known(ss) = plan { + assert_eq!(ss, empty_ss) + } else { + panic!("unexpected type: {plan:?}") + } + } + + #[test] + fn test_builder_strings_only() { + let plan = StringSetPlanBuilder::new() + .append_other(to_string_set(&["foo", "bar"]).into()) + .append_other(to_string_set(&["bar", "baz"]).into()) + .build() + .unwrap(); + + let expected_ss = to_string_set(&["foo", "bar", "baz"]).into(); + + if let StringSetPlan::Known(ss) = plan { + assert_eq!(ss, expected_ss) + } else { + panic!("unexpected type: {plan:?}") + } + } + + #[derive(Debug)] + struct TestError {} + + impl std::fmt::Display for TestError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "this is an error") + } + } + + impl std::error::Error for TestError {} + + #[tokio::test] + async fn test_builder_plan() { + let batch = str_iter_to_batch("column_name", vec![Some("from_a_plan")]).unwrap(); + let df_plan = make_scan_plan(batch).unwrap(); + + // when a df plan is appended the whole plan should be different + let plan = StringSetPlanBuilder::new() + .append_other(to_string_set(&["foo", "bar"]).into()) + 
.append_other(vec![df_plan].into()) + .append_other(to_string_set(&["baz"]).into()) + .build() + .unwrap(); + + let expected_ss = to_string_set(&["foo", "bar", "baz", "from_a_plan"]).into(); + + assert!(matches!(plan, StringSetPlan::Plan(_))); + let exec = Executor::new_testing(); + let ctx = exec.new_context(ExecutorType::Query); + let ss = ctx.to_string_set(plan).await.unwrap(); + assert_eq!(ss, expected_ss); + } + + fn to_string_set(v: &[&str]) -> StringSet { + v.iter().map(|s| s.to_string()).collect::() + } +} diff --git a/iox_query/src/provider.rs b/iox_query/src/provider.rs new file mode 100644 index 0000000..3fab97e --- /dev/null +++ b/iox_query/src/provider.rs @@ -0,0 +1,641 @@ +//! Implementation of a DataFusion `TableProvider` in terms of `QueryChunk`s + +use async_trait::async_trait; +use std::{collections::HashSet, sync::Arc}; + +use arrow::{ + datatypes::{Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}, + error::ArrowError, +}; +use datafusion::{ + datasource::{provider_as_source, TableProvider}, + error::{DataFusionError, Result as DataFusionResult}, + execution::context::SessionState, + logical_expr::{ + utils::{conjunction, split_conjunction}, + LogicalPlanBuilder, TableProviderFilterPushDown, TableType, + }, + physical_plan::{ + expressions::col as physical_col, filter::FilterExec, projection::ProjectionExec, + ExecutionPlan, + }, + prelude::Expr, + sql::TableReference, +}; +use observability_deps::tracing::trace; +use schema::{sort::SortKey, Schema}; + +use crate::{ + chunk_order_field, + util::{arrow_sort_key_exprs, df_physical_expr}, + QueryChunk, CHUNK_ORDER_COLUMN_NAME, +}; + +use snafu::{ResultExt, Snafu}; + +mod adapter; +mod deduplicate; +pub mod overlap; +mod physical; +pub(crate) mod progressive_eval; +mod record_batch_exec; +pub use self::overlap::group_potential_duplicates; +pub use deduplicate::{DeduplicateExec, RecordBatchDeduplicator}; +pub(crate) use physical::{chunks_to_physical_nodes, PartitionedFileExt}; + +pub(crate) 
use record_batch_exec::RecordBatchesExec; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Internal error: no chunk pruner provided to builder for {}", + table_name, + ))] + InternalNoChunkPruner { table_name: String }, + + #[snafu(display("Internal error: Cannot create projection select expr '{}'", source,))] + InternalSelectExpr { + source: datafusion::error::DataFusionError, + }, + + #[snafu(display("Internal error adding sort operator '{}'", source,))] + InternalSort { + source: datafusion::error::DataFusionError, + }, + + #[snafu(display("Internal error adding filter operator '{}'", source,))] + InternalFilter { + source: datafusion::error::DataFusionError, + }, + + #[snafu(display("Internal error adding projection operator '{}'", source,))] + InternalProjection { + source: datafusion::error::DataFusionError, + }, +} +pub type Result = std::result::Result; + +impl From for ArrowError { + // Wrap an error into an arrow error + fn from(e: Error) -> Self { + Self::ExternalError(Box::new(e)) + } +} + +impl From for DataFusionError { + // Wrap an error into a datafusion error + fn from(e: Error) -> Self { + Self::ArrowError(e.into(), None) + } +} + +/// Builds a `ChunkTableProvider` from a series of `QueryChunk`s +/// and ensures the schema across the chunks is compatible and +/// consistent. 
#[derive(Debug)]
pub struct ProviderBuilder {
    table_name: Arc<str>,
    schema: Schema,
    chunks: Vec<Arc<dyn QueryChunk>>,
    deduplication: bool,
}

impl ProviderBuilder {
    /// Create a builder for `table_name` with the given IOx schema.
    ///
    /// Panics if the schema already contains the internal chunk-order
    /// column, which the provider adds itself during `scan`.
    pub fn new(table_name: Arc<str>, schema: Schema) -> Self {
        assert_eq!(schema.find_index_of(CHUNK_ORDER_COLUMN_NAME), None);

        Self {
            table_name,
            schema,
            chunks: Vec::new(),
            // De-duplication is on by default; see `with_enable_deduplication`.
            deduplication: true,
        }
    }

    /// Enable or disable de-duplication of primary-key duplicates during scans.
    pub fn with_enable_deduplication(mut self, enable_deduplication: bool) -> Self {
        self.deduplication = enable_deduplication;
        self
    }

    /// Add a new chunk to this provider
    pub fn add_chunk(mut self, chunk: Arc<dyn QueryChunk>) -> Self {
        self.chunks.push(chunk);
        self
    }

    /// Create the Provider
    pub fn build(self) -> Result<ChunkTableProvider> {
        Ok(ChunkTableProvider {
            iox_schema: self.schema,
            table_name: self.table_name,
            chunks: self.chunks,
            deduplication: self.deduplication,
        })
    }
}

/// Implementation of a DataFusion TableProvider in terms of QueryChunks
///
/// This allows DataFusion to see data from Chunks as a single table, as well as
/// push predicates and selections down to chunks
#[derive(Debug)]
pub struct ChunkTableProvider {
    table_name: Arc<str>,
    /// The IOx schema (wrapper around Arrow Schemaref) for this table
    iox_schema: Schema,
    /// The chunks
    chunks: Vec<Arc<dyn QueryChunk>>,
    /// do deduplication
    deduplication: bool,
}

impl ChunkTableProvider {
    /// Return the IOx schema view for the data provided by this provider
    pub fn iox_schema(&self) -> &Schema {
        &self.iox_schema
    }

    /// Return the Arrow schema view for the data provided by this provider
    pub fn arrow_schema(&self) -> ArrowSchemaRef {
        self.iox_schema.as_arrow()
    }

    /// Return the table name
    pub fn table_name(&self) -> &str {
        self.table_name.as_ref()
    }

    /// Running deduplication or not
    pub fn deduplication(&self) -> bool {
        self.deduplication
    }

    /// Convert into a logical plan builder.
+ pub fn into_logical_plan_builder( + self: Arc, + ) -> Result { + let table_name = self.table_name().to_owned(); + let source = provider_as_source(self as _); + + // Scan all columns (DataFusion optimizer will prune this + // later if possible) + let projection = None; + + // Do not parse the tablename as a SQL identifer, but use as is + let table_ref = TableReference::bare(table_name); + LogicalPlanBuilder::scan(table_ref, source, projection) + } +} + +#[async_trait] +impl TableProvider for ChunkTableProvider { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + /// Schema with all available columns across all chunks + fn schema(&self) -> ArrowSchemaRef { + self.arrow_schema() + } + + /// Creates a plan like the following: + /// + /// ```text + /// Project (keep only columns needed in the rest of the plan) + /// Filter (optional, apply any push down predicates) + /// Deduplicate (optional, if chunks overlap) + /// ... Scan of Chunks (RecordBatchExec / ParquetExec / UnionExec, etc) ... + /// ``` + async fn scan( + &self, + ctx: &SessionState, + projection: Option<&Vec>, + filters: &[Expr], + _limit: Option, + ) -> std::result::Result, DataFusionError> { + trace!("Create a scan node for ChunkTableProvider"); + + let schema_with_chunk_order = Arc::new(ArrowSchema::new( + self.iox_schema + .as_arrow() + .fields + .iter() + .cloned() + .chain(std::iter::once(chunk_order_field())) + .collect::(), + )); + let pk = self.iox_schema().primary_key(); + let dedup_sort_key = SortKey::from_columns(pk.iter().copied()); + + // Create data stream from chunk data. This is the most simple data stream possible and contains duplicates and + // has no filters at all. + let plan = chunks_to_physical_nodes( + &schema_with_chunk_order, + None, + self.chunks.clone(), + ctx.config().target_partitions(), + ); + + // De-dup before doing anything else, because all logical expressions act on de-duplicated data. 
+ let plan = if self.deduplication { + let sort_exprs = arrow_sort_key_exprs(&dedup_sort_key, &plan.schema()); + Arc::new(DeduplicateExec::new(plan, sort_exprs, true)) + } else { + plan + }; + + // Filter as early as possible (AFTER de-dup!). Predicate pushdown will eventually push down parts of this. + let plan = if let Some(expr) = filters.iter().cloned().reduce(|a, b| a.and(b)) { + let maybe_expr = if !self.deduplication { + let dedup_cols = pk.into_iter().collect::>(); + conjunction( + split_conjunction(&expr) + .into_iter() + .filter(|expr| { + let Ok(expr_cols) = expr.to_columns() else { + return false; + }; + expr_cols + .into_iter() + .all(|c| dedup_cols.contains(c.name.as_str())) + }) + .cloned(), + ) + } else { + Some(expr) + }; + + if let Some(expr) = maybe_expr { + Arc::new(FilterExec::try_new( + df_physical_expr(plan.schema(), expr)?, + plan, + )?) + } else { + plan + } + } else { + plan + }; + + // Project at last because it removes columns and hence other operations may fail. Projection pushdown will + // optimize that later. + // Always project because we MUST make sure that chunk order col doesn't leak to the user or to our parquet + // files. 
+ let default_projection: Vec<_> = (0..self.iox_schema.len()).collect(); + let projection = projection.unwrap_or(&default_projection); + let select_exprs = self + .iox_schema() + .select_by_indices(projection) + .as_arrow() + .fields() + .iter() + .map(|f| { + let field_name = f.name(); + let physical_expr = + physical_col(field_name, &self.schema()).context(InternalSelectExprSnafu)?; + Ok((physical_expr, field_name.to_string())) + }) + .collect::>>()?; + + let plan = Arc::new(ProjectionExec::try_new(select_exprs, plan)?); + + Ok(plan) + } + + /// Filter pushdown specification + fn supports_filter_pushdown( + &self, + _filter: &Expr, + ) -> DataFusionResult { + if self.deduplication { + Ok(TableProviderFilterPushDown::Exact) + } else { + Ok(TableProviderFilterPushDown::Inexact) + } + } + + fn table_type(&self) -> TableType { + TableType::Base + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{ + exec::IOxSessionContext, + pruning::retention_expr, + test::{format_execution_plan, TestChunk}, + }; + use datafusion::prelude::{col, lit}; + + #[tokio::test] + async fn provider_scan_default() { + let table_name = "t"; + let chunk1 = Arc::new( + TestChunk::new(table_name) + .with_id(1) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_f64_field_column("field") + .with_time_column(), + ) as Arc; + let chunk2 = Arc::new( + TestChunk::new(table_name) + .with_id(2) + .with_dummy_parquet_file() + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_f64_field_column("field") + .with_time_column(), + ) as Arc; + let schema = chunk1.schema().clone(); + + let ctx = IOxSessionContext::with_testing(); + let state = ctx.inner().state(); + + let provider = ProviderBuilder::new(Arc::from(table_name), schema) + .add_chunk(Arc::clone(&chunk1)) + .add_chunk(Arc::clone(&chunk2)) + .build() + .unwrap(); + + // simple plan + let plan = provider.scan(&state, None, &[], None).await.unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + 
@r###" + --- + - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + + // projection + let plan = provider + .scan(&state, Some(&vec![1, 3]), &[], None) + .await + .unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[tag1@1 as tag1, time@3 as time]" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + + // filters + let expr = vec![lit(false)]; + let expr_ref = expr.iter().collect::>(); + assert_eq!( + provider.supports_filters_pushdown(&expr_ref).unwrap(), + vec![TableProviderFilterPushDown::Exact] + ); + let plan = provider.scan(&state, None, &expr, None).await.unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" + - " FilterExec: false" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + + // limit pushdown is unimplemented at the moment + let plan = provider.scan(&state, None, &[], Some(1)).await.unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + 
@r###" + --- + - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + #[tokio::test] + async fn provider_scan_no_dedup() { + let table_name = "t"; + let chunk1 = Arc::new( + TestChunk::new(table_name) + .with_id(1) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_f64_field_column("field") + .with_time_column(), + ) as Arc; + let chunk2 = Arc::new( + TestChunk::new(table_name) + .with_id(2) + .with_dummy_parquet_file() + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_f64_field_column("field") + .with_time_column(), + ) as Arc; + let schema = chunk1.schema().clone(); + + let ctx = IOxSessionContext::with_testing(); + let state = ctx.inner().state(); + + let provider = ProviderBuilder::new(Arc::from(table_name), schema) + .add_chunk(Arc::clone(&chunk1)) + .add_chunk(Arc::clone(&chunk2)) + .with_enable_deduplication(false) + .build() + .unwrap(); + + // simple plan + let plan = provider.scan(&state, None, &[], None).await.unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + + // projection + let plan = provider + .scan(&state, Some(&vec![1, 3]), &[], None) + .await + .unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: 
expr=[tag1@1 as tag1, time@3 as time]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + + // filters + // Expressions on fields are NOT pushed down because they cannot be pushed through de-dup. + let expr = vec![ + lit(false), + col("tag1").eq(lit("foo")), + col("field").eq(lit(1.0)), + ]; + let expr_ref = expr.iter().collect::>(); + assert_eq!( + provider.supports_filters_pushdown(&expr_ref).unwrap(), + vec![ + TableProviderFilterPushDown::Inexact, + TableProviderFilterPushDown::Inexact, + TableProviderFilterPushDown::Inexact + ] + ); + let plan = provider.scan(&state, None, &expr, None).await.unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" + - " FilterExec: false AND tag1@1 = CAST(foo AS Dictionary(Int32, Utf8))" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + + // limit pushdown is unimplemented at the moment + let plan = provider.scan(&state, None, &[], Some(1)).await.unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + #[tokio::test] + async fn provider_scan_retention() { + let table_name = "t"; + 
let pred = retention_expr(100); + let chunk1 = Arc::new( + TestChunk::new(table_name) + .with_id(1) + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_f64_field_column("field") + .with_time_column(), + ) as Arc; + let chunk2 = Arc::new( + TestChunk::new(table_name) + .with_id(2) + .with_dummy_parquet_file() + .with_tag_column("tag1") + .with_tag_column("tag2") + .with_f64_field_column("field") + .with_time_column(), + ) as Arc; + let schema = chunk1.schema().clone(); + + let ctx = IOxSessionContext::with_testing(); + let state = ctx.inner().state(); + + let provider = ProviderBuilder::new(Arc::from(table_name), schema) + .add_chunk(Arc::clone(&chunk1)) + .add_chunk(Arc::clone(&chunk2)) + .build() + .unwrap(); + + // simple plan + let plan = provider + .scan(&state, None, &[pred.clone()], None) + .await + .unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" + - " FilterExec: time@3 > 100" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + + // projection + let plan = provider + .scan(&state, Some(&vec![1, 3]), &[pred.clone()], None) + .await + .unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[tag1@1 as tag1, time@3 as time]" + - " FilterExec: time@3 > 100" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + + // 
filters + let expr = vec![lit(false), pred.clone()]; + let expr_ref = expr.iter().collect::>(); + assert_eq!( + provider.supports_filters_pushdown(&expr_ref).unwrap(), + vec![ + TableProviderFilterPushDown::Exact, + TableProviderFilterPushDown::Exact + ] + ); + let plan = provider.scan(&state, None, &expr, None).await.unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" + - " FilterExec: false AND time@3 > 100" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + + // limit pushdown is unimplemented at the moment + let plan = provider.scan(&state, None, &[pred], Some(1)).await.unwrap(); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" + - " FilterExec: time@3 > 100" + - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } +} diff --git a/iox_query/src/provider/adapter.rs b/iox_query/src/provider/adapter.rs new file mode 100644 index 0000000..a0f1ad9 --- /dev/null +++ b/iox_query/src/provider/adapter.rs @@ -0,0 +1,514 @@ +//! 
Holds a stream that ensures chunks have the same (uniform) schema +use std::{collections::HashMap, sync::Arc}; + +use snafu::Snafu; +use std::task::{Context, Poll}; + +use arrow::{ + array::new_null_array, + datatypes::{DataType, SchemaRef}, + record_batch::RecordBatch, +}; +use datafusion::physical_plan::{ + metrics::BaselineMetrics, RecordBatchStream, SendableRecordBatchStream, +}; +use datafusion::{error::DataFusionError, scalar::ScalarValue}; +use futures::Stream; + +/// Schema creation / validation errors. +#[allow(clippy::enum_variant_names)] +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Internal error creating SchemaAdapterStream: input field '{}' had type '{:?}' which is different than output field '{}' which had type '{:?}'", + input_field_name, input_field_type, output_field_name, output_field_type,))] + InternalDataTypeMismatch { + input_field_name: String, + input_field_type: DataType, + output_field_name: String, + output_field_type: DataType, + }, + + #[snafu(display("Internal error creating SchemaAdapterStream: creating virtual value of type '{:?}' which is different than output field '{}' which had type '{:?}'", + field_type, output_field_name, output_field_type,))] + InternalDataTypeMismatchForVirtual { + field_type: DataType, + output_field_name: String, + output_field_type: DataType, + }, + + #[snafu(display("Internal error creating SchemaAdapterStream: the field '{}' is specified within the input and as a virtual column, don't know which one to choose", + field_name))] + InternalColumnBothInInputAndVirtual { field_name: String }, + + #[snafu(display("Internal error creating SchemaAdapterStream: field '{}' had output type '{:?}' and should be a NULL column but the field is flagged as 'not null'", + field_name, output_field_type,))] + InternalColumnNotNullable { + field_name: String, + output_field_type: DataType, + }, +} +pub type Result = std::result::Result; + +/// This stream wraps another underlying stream to ensure it 
produces +/// the specified schema. If the underlying stream produces a subset +/// of the columns specified in desired schema, this stream creates +/// arrays with NULLs to pad out the missing columns or creates "virtual" columns which contain a fixed given value. +/// +/// For example: +/// +/// If a table had schema with Cols A, B, C, and D, but the chunk (input) +/// stream only produced record batches with columns A and C. For D we provided a virtual value of "foo". This +/// stream would append a column of B / nulls to each record batch +/// that flowed through it and create a constant column D. +/// +/// ```text +/// +/// ┌────────────────┐ ┌───────────────────────────────┐ +/// │ ┌─────┐┌─────┐ │ │ ┌─────┐┌──────┐┌─────┐┌─────┐ │ +/// │ │ A ││ C │ │ │ │ A ││ B ││ C ││ D │ │ +/// │ │ - ││ - │ │ │ │ - ││ - ││ - ││ - │ │ +/// ┌──────────────┐ │ │ 1 ││ 10 │ │ ┌──────────────┐ │ │ 1 ││ NULL ││ 10 ││ foo │ │ +/// │ Input │ │ │ 2 ││ 20 │ │ │ Adapter │ │ │ 2 ││ NULL ││ 20 ││ foo │ │ +/// │ Stream ├────▶ │ │ 3 ││ 30 │ │────▶│ Stream ├───▶│ │ 3 ││ NULL ││ 30 ││ foo │ │ +/// └──────────────┘ │ │ 4 ││ 40 │ │ └──────────────┘ │ │ 4 ││ NULL ││ 40 ││ foo │ │ +/// │ └─────┘└─────┘ │ │ └─────┘└──────┘└─────┘└─────┘ │ +/// │ │ │ │ +/// │ Record Batch │ │ Record Batch │ +/// └────────────────┘ └───────────────────────────────┘ +/// ``` +pub(crate) struct SchemaAdapterStream { + input: SendableRecordBatchStream, + /// Output schema of this stream + /// The schema of `input` is always a subset of output_schema + output_schema: SchemaRef, + mappings: Vec, + /// metrics to record execution + baseline_metrics: BaselineMetrics, +} + +impl std::fmt::Debug for SchemaAdapterStream { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SchemaAdapterStream") + .field("input", &"(OPAQUE STREAM)") + .field("output_schema", &self.output_schema) + .field("mappings", &self.mappings) + .finish() + } +} + +impl SchemaAdapterStream { + /// Try to create a new 
adapter stream that produces batches with + /// the specified output schema. + /// + /// Virtual columns that contain constant values may be added via `virtual_columns`. Note that these columns MUST + /// NOT appear in underlying stream, other wise this method will fail. + /// + /// Columns that appear neither within the underlying stream nor a specified as virtual are created as pure NULL + /// columns. Note that the column must be nullable for this to work. + /// + /// If the underlying stream produces columns that DO NOT appear + /// in the output schema, or are different types than the output + /// schema, an error will be produced. + pub(crate) fn try_new( + input: SendableRecordBatchStream, + output_schema: SchemaRef, + virtual_columns: &HashMap<&str, ScalarValue>, + baseline_metrics: BaselineMetrics, + ) -> Result { + // record this setup time + let timer = baseline_metrics.elapsed_compute().timer(); + + let input_schema = input.schema(); + + // Figure out how to compute each column in the output + let mappings = output_schema + .fields() + .iter() + .map(|output_field| { + let input_field_index = input_schema + .fields() + .iter() + .enumerate() + .find(|(_, input_field)| output_field.name() == input_field.name()) + .map(|(idx, _)| idx); + + if let Some(input_field_index) = input_field_index { + ColumnMapping::FromInput(input_field_index) + } else if let Some(value) = virtual_columns.get(output_field.name().as_str()) { + ColumnMapping::Virtual(value.clone()) + } else { + ColumnMapping::MakeNull(output_field.data_type().clone()) + } + }) + .collect::>(); + + // Verify the mappings match the output type + for (output_index, mapping) in mappings.iter().enumerate() { + let output_field = output_schema.field(output_index); + + match mapping { + ColumnMapping::FromInput(input_index) => { + let input_field = input_schema.field(*input_index); + if input_field.data_type() != output_field.data_type() { + return InternalDataTypeMismatchSnafu { + input_field_name: 
input_field.name(), + input_field_type: input_field.data_type().clone(), + output_field_name: output_field.name(), + output_field_type: output_field.data_type().clone(), + } + .fail(); + } + + if virtual_columns.contains_key(input_field.name().as_str()) { + return InternalColumnBothInInputAndVirtualSnafu { + field_name: input_field.name().clone(), + } + .fail(); + } + } + ColumnMapping::MakeNull(_) => { + if !output_field.is_nullable() { + return InternalColumnNotNullableSnafu { + field_name: output_field.name().clone(), + output_field_type: output_field.data_type().clone(), + } + .fail(); + } + } + ColumnMapping::Virtual(value) => { + let data_type = value.data_type(); + if &data_type != output_field.data_type() { + return InternalDataTypeMismatchForVirtualSnafu { + field_type: data_type, + output_field_name: output_field.name(), + output_field_type: output_field.data_type().clone(), + } + .fail(); + } + } + } + } + + timer.done(); + Ok(Self { + input, + output_schema, + mappings, + baseline_metrics, + }) + } + + /// Extends the record batch, if needed, so that it matches the schema + fn extend_batch(&self, batch: RecordBatch) -> Result { + let output_columns = self + .mappings + .iter() + .map(|mapping| match mapping { + ColumnMapping::FromInput(input_index) => Ok(Arc::clone(batch.column(*input_index))), + ColumnMapping::MakeNull(data_type) => { + Ok(new_null_array(data_type, batch.num_rows())) + } + ColumnMapping::Virtual(value) => value.to_array_of_size(batch.num_rows()), + }) + .collect::, DataFusionError>>()?; + + Ok(RecordBatch::try_new( + Arc::clone(&self.output_schema), + output_columns, + )?) 
+ } +} + +impl RecordBatchStream for SchemaAdapterStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.output_schema) + } +} + +impl Stream for SchemaAdapterStream { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + ctx: &mut Context<'_>, + ) -> Poll> { + // the Poll result is an Opton> so we need a few + // layers of maps to get at the actual batch, if any + let poll = self.input.as_mut().poll_next(ctx).map(|maybe_result| { + maybe_result.map(|batch| batch.and_then(|batch| self.extend_batch(batch))) + }); + self.baseline_metrics.record_poll(poll) + } + + // TODO is there a useful size_hint to pass? +} + +/// Describes how to create column in the output. +#[derive(Debug)] +enum ColumnMapping { + /// Output column is found at `` column of the input schema + FromInput(usize), + + /// Output colum should be synthesized with nulls of the specified type + MakeNull(DataType), + + /// Create virtual chunk column + Virtual(ScalarValue), +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + use arrow::{ + array::{ArrayRef, Int32Array, StringArray}, + datatypes::{Field, Schema}, + record_batch::RecordBatch, + }; + use arrow_util::assert_batches_eq; + use datafusion::physical_plan::{common::collect, metrics::ExecutionPlanMetricsSet}; + use datafusion_util::stream_from_batch; + use test_helpers::assert_contains; + + #[tokio::test] + async fn same_input_and_output() { + let batch = make_batch(); + + let output_schema = batch.schema(); + let input_stream = stream_from_batch(batch.schema(), batch); + let adapter_stream = SchemaAdapterStream::try_new( + input_stream, + output_schema, + &Default::default(), + baseline_metrics(), + ) + .unwrap(); + + let output = collect(Box::pin(adapter_stream)) + .await + .expect("Running plan"); + let expected = vec![ + "+---+---+-----+", + "| a | b | c |", + "+---+---+-----+", + "| 1 | 4 | foo |", + "| 2 | 5 | bar |", + "| 3 | 6 | baz |", + "+---+---+-----+", + ]; + 
assert_batches_eq!(&expected, &output); + } + + #[tokio::test] + async fn input_different_order_than_output() { + let batch = make_batch(); + // input has columns in different order than desired output + + let output_schema = Arc::new(Schema::new(vec![ + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Utf8, false), + Field::new("a", DataType::Int32, false), + ])); + let input_stream = stream_from_batch(batch.schema(), batch); + let adapter_stream = SchemaAdapterStream::try_new( + input_stream, + output_schema, + &Default::default(), + baseline_metrics(), + ) + .unwrap(); + + let output = collect(Box::pin(adapter_stream)) + .await + .expect("Running plan"); + let expected = vec![ + "+---+-----+---+", + "| b | c | a |", + "+---+-----+---+", + "| 4 | foo | 1 |", + "| 5 | bar | 2 |", + "| 6 | baz | 3 |", + "+---+-----+---+", + ]; + assert_batches_eq!(&expected, &output); + } + + #[tokio::test] + async fn input_subset_of_output() { + let batch = make_batch(); + // input has subset of columns of the desired otuput. 
d and e are not present + let output_schema = Arc::new(Schema::new(vec![ + Field::new("c", DataType::Utf8, false), + Field::new("e", DataType::Float64, true), + Field::new("b", DataType::Int32, false), + Field::new("d", DataType::Float32, true), + Field::new("f", DataType::Utf8, true), + Field::new("g", DataType::Int32, false), + Field::new("h", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + ])); + let input_stream = stream_from_batch(batch.schema(), batch); + let adapter_stream = SchemaAdapterStream::try_new( + input_stream, + output_schema, + &HashMap::from([ + ("f", ScalarValue::from("xxx")), + ("g", ScalarValue::from(1i32)), + ("h", ScalarValue::from(1i32)), + ]), + baseline_metrics(), + ) + .unwrap(); + + let output = collect(Box::pin(adapter_stream)) + .await + .expect("Running plan"); + let expected = vec![ + "+-----+---+---+---+-----+---+---+---+", + "| c | e | b | d | f | g | h | a |", + "+-----+---+---+---+-----+---+---+---+", + "| foo | | 4 | | xxx | 1 | 1 | 1 |", + "| bar | | 5 | | xxx | 1 | 1 | 2 |", + "| baz | | 6 | | xxx | 1 | 1 | 3 |", + "+-----+---+---+---+-----+---+---+---+", + ]; + assert_batches_eq!(&expected, &output); + } + + #[tokio::test] + async fn input_superset_of_columns() { + let batch = make_batch(); + + // No such column "b" in output -- column would be lost + let output_schema = Arc::new(Schema::new(vec![ + Field::new("c", DataType::Utf8, false), + Field::new("a", DataType::Int32, false), + ])); + let input_stream = stream_from_batch(batch.schema(), batch); + let adapter_stream = SchemaAdapterStream::try_new( + input_stream, + output_schema, + &Default::default(), + baseline_metrics(), + ) + .unwrap(); + + let output = collect(Box::pin(adapter_stream)) + .await + .expect("Running plan"); + let expected = vec![ + "+-----+---+", + "| c | a |", + "+-----+---+", + "| foo | 1 |", + "| bar | 2 |", + "| baz | 3 |", + "+-----+---+", + ]; + assert_batches_eq!(&expected, &output); + } + + #[tokio::test] + async fn 
input_has_different_type() { + let batch = make_batch(); + + // column c has string type in input, output asks float32 + let output_schema = Arc::new(Schema::new(vec![ + Field::new("c", DataType::Float32, false), + Field::new("b", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + ])); + let input_stream = stream_from_batch(batch.schema(), batch); + let res = SchemaAdapterStream::try_new( + input_stream, + output_schema, + &Default::default(), + baseline_metrics(), + ); + + assert_contains!(res.unwrap_err().to_string(), "input field 'c' had type 'Utf8' which is different than output field 'c' which had type 'Float32'"); + } + + #[tokio::test] + async fn virtual_col_has_wrong_type() { + let batch = make_batch(); + + let output_schema = Arc::new(Schema::new(vec![ + Field::new("c", DataType::Utf8, false), + Field::new("b", DataType::Int32, false), + Field::new("d", DataType::UInt8, false), + Field::new("a", DataType::Int32, false), + ])); + let input_stream = stream_from_batch(batch.schema(), batch); + let res = SchemaAdapterStream::try_new( + input_stream, + output_schema, + &HashMap::from([("d", ScalarValue::from(1u32))]), + baseline_metrics(), + ); + + assert_contains!(res.unwrap_err().to_string(), "creating virtual value of type 'UInt32' which is different than output field 'd' which had type 'UInt8'"); + } + + #[tokio::test] + async fn virtual_col_also_in_input() { + let batch = make_batch(); + + let output_schema = Arc::new(Schema::new(vec![ + Field::new("c", DataType::Utf8, false), + Field::new("b", DataType::Int32, false), + Field::new("d", DataType::Utf8, false), + Field::new("a", DataType::Int32, false), + ])); + let input_stream = stream_from_batch(batch.schema(), batch); + let res = SchemaAdapterStream::try_new( + input_stream, + output_schema, + &HashMap::from([ + ("a", ScalarValue::from(1i32)), + ("d", ScalarValue::from("foo")), + ]), + baseline_metrics(), + ); + + assert_contains!(res.unwrap_err().to_string(), "the field 'a' is 
specified within the input and as a virtual column, don't know which one to choose"); + } + + #[tokio::test] + async fn null_non_nullable_column() { + let batch = make_batch(); + + let output_schema = Arc::new(Schema::new(vec![ + Field::new("c", DataType::Utf8, false), + Field::new("b", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + Field::new("d", DataType::Utf8, false), + ])); + let input_stream = stream_from_batch(batch.schema(), batch); + let res = SchemaAdapterStream::try_new( + input_stream, + output_schema, + &Default::default(), + baseline_metrics(), + ); + + assert_contains!(res.unwrap_err().to_string(), "field 'd' had output type 'Utf8' and should be a NULL column but the field is flagged as 'not null'"); + } + + // input has different column types than desired output + + fn make_batch() -> RecordBatch { + let col_a = Arc::new(Int32Array::from(vec![1, 2, 3])); + let col_b = Arc::new(Int32Array::from(vec![4, 5, 6])); + let col_c = Arc::new(StringArray::from(vec!["foo", "bar", "baz"])); + + RecordBatch::try_from_iter(vec![("a", col_a as ArrayRef), ("b", col_b), ("c", col_c)]) + .unwrap() + } + + /// Create a BaselineMetrics object for testing + fn baseline_metrics() -> BaselineMetrics { + BaselineMetrics::new(&ExecutionPlanMetricsSet::new(), 0) + } +} diff --git a/iox_query/src/provider/deduplicate.rs b/iox_query/src/provider/deduplicate.rs new file mode 100644 index 0000000..45c0250 --- /dev/null +++ b/iox_query/src/provider/deduplicate.rs @@ -0,0 +1,1238 @@ +//! 
Implemention of DeduplicateExec operator (resolves primary key conflicts) plumbing and tests +mod algo; + +use std::{collections::HashSet, fmt, sync::Arc}; + +use arrow::{error::ArrowError, record_batch::RecordBatch}; +use datafusion_util::{watch::WatchedTask, AdapterStream}; + +use crate::CHUNK_ORDER_COLUMN_NAME; + +use self::algo::get_col_name; +pub use self::algo::RecordBatchDeduplicator; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::{ + error::{DataFusionError, Result}, + execution::context::TaskContext, + physical_expr::PhysicalSortRequirement, + physical_plan::{ + expressions::{Column, PhysicalSortExpr}, + metrics::{ + self, BaselineMetrics, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, RecordOutput, + }, + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, + SendableRecordBatchStream, Statistics, + }, +}; +use futures::StreamExt; +use observability_deps::tracing::{debug, trace}; +use tokio::sync::mpsc; + +/// # DeduplicateExec +/// +/// This operator takes an input stream of RecordBatches that is +/// already sorted on "sort_key" and applies IOx specific deduplication +/// logic. +/// +/// The output is dependent on the order of the the input rows which +/// have the same key. +/// +/// Specifically, the value chosen for each non-sort_key column is the +/// "last" non-null value. This is used to model "upserts" when new +/// rows with the same primary key are inserted a second time to update +/// existing values. 
+/// +/// # Example +/// For example, given a sort key of (t1, t2) and the following input +/// (already sorted on t1 and t2): +/// +/// ```text +/// +----+----+----+----+ +/// | t1 | t2 | f1 | f2 | +/// +----+----+----+----+ +/// | a | x | 2 | | +/// | a | x | 2 | 1 | +/// | a | x | | 3 | +/// | a | y | 3 | 1 | +/// | b | y | 3 | | +/// | c | y | 1 | 1 | +/// +----+----+----+----+ +/// ``` +/// +/// This operator will produce the following output (note the values +/// chosen for (a, x)): +/// +/// ```text +/// +----+----+----+----+ +/// | t1 | t2 | f1 | f2 | +/// +----+----+----+----+ +/// | a | x | 2 | 3 | +/// | a | y | 3 | 1 | +/// | b | y | 3 | | +/// | c | y | 1 | 1 | +/// +----+----+----+----+ +/// ``` +/// +/// # Field Resolution (why the last non-null value?) +/// +/// The choice of the latest non-null value instead of the latest value is +/// subtle and thus we try to document the rationale here. It is a +/// consequence of the LineProtocol update model. +/// +/// Some observations about line protocol are: +/// +/// 1. Lines are treated as "UPSERT"s (aka updating any existing +/// values, possibly adding new fields) +/// +/// 2. Fields can not be removed or set to NULL via a line (So if a +/// field has a NULL value it means the user didn't provide a value +/// for that field) +/// +/// For example, this data (with a NULL for `f2`): +/// +/// ```text +/// t1 | f1 | f2 +/// ---+----+---- +/// a | 1 | 3 +// a | 2 | +/// ``` +/// +/// Would have come from line protocol like +/// ```text +/// m,t1=a f1=1,f2=3 +/// m,t1=a f1=3 +/// ``` +/// (note there was no value for f2 provided in the second line, it can +/// be read as "upsert value of f1=3, the value of f2 is not modified). +/// +/// Thus it would not be correct to take the latest value from f2 +/// (NULL) as in the source input the field's value was not provided. 
#[derive(Debug)]
pub struct DeduplicateExec {
    input: Arc<dyn ExecutionPlan>,
    sort_keys: Vec<PhysicalSortExpr>,
    input_order: Vec<PhysicalSortExpr>,
    use_chunk_order_col: bool,
    /// Execution metrics
    metrics: ExecutionPlanMetricsSet,
}

impl DeduplicateExec {
    pub fn new(
        input: Arc<dyn ExecutionPlan>,
        sort_keys: Vec<PhysicalSortExpr>,
        use_chunk_order_col: bool,
    ) -> Self {
        let mut input_order = sort_keys.clone();
        if use_chunk_order_col {
            input_order.push(PhysicalSortExpr {
                expr: Arc::new(
                    Column::new_with_schema(CHUNK_ORDER_COLUMN_NAME, &input.schema())
                        .expect("input has chunk order col"),
                ),
                options: Default::default(),
            })
        }
        Self {
            input,
            sort_keys,
            input_order,
            use_chunk_order_col,
            metrics: ExecutionPlanMetricsSet::new(),
        }
    }

    pub fn sort_keys(&self) -> &[PhysicalSortExpr] {
        &self.sort_keys
    }

    /// Combination of all columns within the sort key and potentially the chunk order column.
    pub fn sort_columns(&self) -> HashSet<&str> {
        self.input_order
            .iter()
            .map(|sk| get_col_name(sk.expr.as_ref()))
            .collect()
    }

    pub fn use_chunk_order_col(&self) -> bool {
        self.use_chunk_order_col
    }
}

#[derive(Debug)]
struct DeduplicateMetrics {
    baseline_metrics: BaselineMetrics,
    num_dupes: metrics::Count,
}

impl DeduplicateMetrics {
    fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
        Self {
            baseline_metrics: BaselineMetrics::new(metrics, partition),
            num_dupes: MetricBuilder::new(metrics).counter("num_dupes", partition),
        }
    }
}

impl ExecutionPlan for DeduplicateExec {
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    fn schema(&self) -> arrow::datatypes::SchemaRef {
        self.input.schema()
    }

    fn output_partitioning(&self) -> Partitioning {
        Partitioning::UnknownPartitioning(1)
    }

    fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
        trace!("Deduplicate output ordering: {:?}", self.sort_keys);
        Some(&self.sort_keys)
    }

    fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirement>>> {
vec![Some(PhysicalSortRequirement::from_sort_exprs( + &self.input_order, + ))] + } + + fn maintains_input_order(&self) -> Vec { + vec![true] + } + + fn benefits_from_input_partitioning(&self) -> Vec { + vec![false] + } + + fn children(&self) -> Vec> { + vec![Arc::clone(&self.input)] + } + + fn equivalence_properties(&self) -> EquivalenceProperties { + // deduplicate does not change the equivalence properties + self.input.equivalence_properties() + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion::error::Result> { + assert_eq!(children.len(), 1); + let input = Arc::clone(&children[0]); + Ok(Arc::new(Self::new( + input, + self.sort_keys.clone(), + self.use_chunk_order_col, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + trace!(partition, "Start DeduplicationExec::execute"); + + if partition != 0 { + return Err(DataFusionError::Internal( + "DeduplicateExec only supports a single input stream".to_string(), + )); + } + let deduplicate_metrics = DeduplicateMetrics::new(&self.metrics, partition); + + let input_stream = self.input.execute(0, context)?; + + // the deduplication is performed in a separate task which is + // then sent via a channel to the output + let (tx, rx) = mpsc::channel(1); + + let fut = deduplicate( + input_stream, + self.sort_keys.clone(), + tx.clone(), + deduplicate_metrics, + ); + + // A second task watches the output of the worker task and reports errors + let handle = WatchedTask::new(fut, vec![tx], "deduplicate batches"); + + debug!( + partition, + "End building stream for DeduplicationExec::execute" + ); + + Ok(AdapterStream::adapt(self.schema(), rx, handle)) + } + + fn required_input_distribution(&self) -> Vec { + // For now use a single input -- it might be helpful + // eventually to deduplicate in parallel by hash partitioning + // the inputs (based on sort keys) + vec![Distribution::SinglePartition] + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + 
} + + fn statistics(&self) -> Result { + // use a guess from our input but they are NOT exact + Ok(self.input.statistics()?.into_inexact()) + } +} + +impl DisplayAs for DeduplicateExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + let expr: Vec = self.sort_keys.iter().map(|e| e.to_string()).collect(); + write!(f, "DeduplicateExec: [{}]", expr.join(",")) + } + } + } +} + +async fn deduplicate( + mut input_stream: SendableRecordBatchStream, + sort_keys: Vec, + tx: mpsc::Sender>, + deduplicate_metrics: DeduplicateMetrics, +) -> Result<(), DataFusionError> { + let DeduplicateMetrics { + baseline_metrics, + num_dupes, + } = deduplicate_metrics; + + let elapsed_compute = baseline_metrics.elapsed_compute(); + let mut deduplicator = RecordBatchDeduplicator::new(sort_keys, num_dupes, None); + + // Stream input through the indexer + while let Some(batch) = input_stream.next().await { + let batch = batch?; + + // First check if this batch has same sort key with its previous batch + let timer = elapsed_compute.timer(); + if let Some(last_batch) = deduplicator + .last_batch_with_no_same_sort_key(&batch) + .record_output(&baseline_metrics) + { + timer.done(); + // No, different sort key, so send the last batch downstream first + if last_batch.num_rows() > 0 { + tx.send(Ok(last_batch)) + .await + .map_err(|e| ArrowError::from_external_error(Box::new(e)))?; + } + } else { + timer.done() + } + + // deduplicate data of the batch + let timer = elapsed_compute.timer(); + let output_batch = deduplicator.push(batch)?.record_output(&baseline_metrics); + timer.done(); + if output_batch.num_rows() > 0 { + tx.send(Ok(output_batch)) + .await + .map_err(|e| ArrowError::from_external_error(Box::new(e)))?; + } + } + debug!("before sending the left over batch"); + + // send any left over batch + let timer = elapsed_compute.timer(); + if let Some(output_batch) = 
deduplicator.finish()?.record_output(&baseline_metrics) { + timer.done(); + if output_batch.num_rows() > 0 { + tx.send(Ok(output_batch)) + .await + .map_err(|e| ArrowError::from_external_error(Box::new(e)))?; + } + } else { + timer.done() + } + debug!("done sending the left over batch"); + + Ok(()) +} + +#[cfg(test)] +mod test { + use arrow::compute::SortOptions; + use arrow::datatypes::{Int32Type, SchemaRef}; + use arrow::{ + array::{ArrayRef, Float64Array, StringArray, TimestampNanosecondArray}, + record_batch::RecordBatch, + }; + use arrow_util::assert_batches_eq; + use datafusion::physical_plan::{expressions::col, memory::MemoryExec}; + use datafusion_util::test_collect; + + use super::*; + use arrow::array::{DictionaryArray, Int64Array}; + use schema::TIME_DATA_TIMEZONE; + use std::iter::FromIterator; + + #[tokio::test] + async fn test_single_tag() { + // input: + // t1 | f1 | f2 + // ---+----+---- + // a | 1 | + // a | 2 | 3 + // a | | 4 + // b | 5 | 6 + // c | 7 | + // c | | + // c | | 8 + // + // expected output: + // + // t1 | f1 | f2 + // ---+----+---- + // a | 2 | 4 + // b | 5 | 6 + // c | 7 | 8 + + let t1 = StringArray::from(vec![ + Some("a"), + Some("a"), + Some("a"), + Some("b"), + Some("c"), + Some("c"), + Some("c"), + ]); + let f1 = Float64Array::from(vec![ + Some(1.0), + Some(2.0), + None, + Some(5.0), + Some(7.0), + None, + None, + ]); + let f2 = Float64Array::from(vec![ + None, + Some(3.0), + Some(4.0), + Some(6.0), + None, + None, + Some(8.0), + ]); + + let batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![PhysicalSortExpr { + expr: col("t1", &batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }]; + + let results = dedupe(vec![batch], sort_keys).await; + + let expected = vec![ + "+----+-----+-----+", + "| t1 | f1 | f2 |", + "+----+-----+-----+", + "| a | 
2.0 | 4.0 |", + "| b | 5.0 | 6.0 |", + "| c | 7.0 | 8.0 |", + "+----+-----+-----+", + ]; + assert_batches_eq!(&expected, &results.output); + } + + #[tokio::test] + async fn test_with_timestamp() { + // input: + // f1 | f2 | time + // ---+----+------ + // 1 | | 100 + // | 3 | 100 + // + // expected output: + // + // f1 | f2 | time + // ---+----+------- + // 1 | 3 | 100 + let f1 = Float64Array::from(vec![Some(1.0), None]); + let f2 = Float64Array::from(vec![None, Some(3.0)]); + + let time = TimestampNanosecondArray::from(vec![Some(100), Some(100)]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); + + let batch = RecordBatch::try_from_iter(vec![ + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ("time", Arc::new(time) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![PhysicalSortExpr { + expr: col("time", &batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }]; + + let results = dedupe(vec![batch], sort_keys).await; + + let expected = vec![ + "+-----+-----+--------------------------------+", + "| f1 | f2 | time |", + "+-----+-----+--------------------------------+", + "| 1.0 | 3.0 | 1970-01-01T00:00:00.000000100Z |", + "+-----+-----+--------------------------------+", + ]; + assert_batches_eq!(&expected, &results.output); + } + + #[tokio::test] + async fn test_multi_tag() { + // input: + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | b | 1 | + // a | b | 2 | 3 + // a | b | | 4 + // a | z | 5 | + // b | b | 6 | + // b | c | 7 | 6 + // c | c | 8 | + // d | b | | 9 + // e | | 10 | 11 + // e | | 12 | + // | f | 13 | + // | f | | 14 + // + // expected output: + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | b | 2 | 4 + // a | z | 5 | + // b | b | 6 | + // b | c | 7 | 6 + // c | c | 8 | + // d | b | | 9 + // e | | 12 | 11 + // | f | 13 | 14 + + let t1 = StringArray::from(vec![ + Some("a"), + Some("a"), + Some("a"), + Some("a"), + Some("b"), + Some("b"), + Some("c"), + Some("d"), + 
Some("e"), + Some("e"), + None, + None, + ]); + + let t2 = StringArray::from(vec![ + Some("b"), + Some("b"), + Some("b"), + Some("z"), + Some("b"), + Some("c"), + Some("c"), + Some("b"), + None, + None, + Some("f"), + Some("f"), + ]); + + let f1 = Float64Array::from(vec![ + Some(1.0), + Some(2.0), + None, + Some(5.0), + Some(6.0), + Some(7.0), + Some(8.0), + None, + Some(10.0), + Some(12.0), + Some(13.0), + None, + ]); + + let f2 = Float64Array::from(vec![ + None, + Some(3.0), + Some(4.0), + None, + None, + Some(6.0), + None, + Some(9.0), + Some(11.0), + None, + None, + Some(14.0), + ]); + + let batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![ + PhysicalSortExpr { + expr: col("t1", &batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: col("t2", &batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]; + + let results = dedupe(vec![batch], sort_keys).await; + + let expected = vec![ + "+----+----+------+------+", + "| t1 | t2 | f1 | f2 |", + "+----+----+------+------+", + "| a | b | 2.0 | 4.0 |", + "| a | z | 5.0 | |", + "| b | b | 6.0 | |", + "| b | c | 7.0 | 6.0 |", + "| c | c | 8.0 | |", + "| d | b | | 9.0 |", + "| e | | 12.0 | 11.0 |", + "| | f | 13.0 | 14.0 |", + "+----+----+------+------+", + ]; + assert_batches_eq!(&expected, &results.output); + } + + #[tokio::test] + async fn test_string_with_timestamp() { + // input: + // s | i | time + // -------+----+------ + // "cat" | | 100 + // | 3 | 100 + // | 4 | 200 + // "dog" | | 200 + // + // expected output: + // + // s | i | time + // -------+----+------- + // "cat" | 3 | 100 + // "dog" | 4 | 200 + let s = StringArray::from(vec![Some("cat"), None, None, Some("dog")]); + + let i = Int64Array::from(vec![None, 
Some(3), Some(4), None]); + + let time = TimestampNanosecondArray::from(vec![Some(100), Some(100), Some(200), Some(200)]); + + let batch = RecordBatch::try_from_iter(vec![ + ("s", Arc::new(s) as ArrayRef), + ("i", Arc::new(i) as ArrayRef), + ("time", Arc::new(time) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![PhysicalSortExpr { + expr: col("time", &batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }]; + + let results = dedupe(vec![batch], sort_keys).await; + + let expected = vec![ + "+-----+---+--------------------------------+", + "| s | i | time |", + "+-----+---+--------------------------------+", + "| cat | 3 | 1970-01-01T00:00:00.000000100Z |", + "| dog | 4 | 1970-01-01T00:00:00.000000200Z |", + "+-----+---+--------------------------------+", + ]; + assert_batches_eq!(&expected, &results.output); + } + + #[tokio::test] + async fn test_last_is_null_with_timestamp() { + // input: + // s | i | time + // -------+----+------ + // "cat" | | 1639612800000000000 + // | 10 | 1639612800000000000 + // + // expected output: + // + // s | i | time + // -------+----+------- + // "cat" | 10 | 1639612800000000000 + let s = StringArray::from(vec![Some("cat"), None]); + + let i = Int64Array::from(vec![None, Some(10)]); + + let time = TimestampNanosecondArray::from(vec![ + Some(1639612800000000000), + Some(1639612800000000000), + ]); + + let batch = RecordBatch::try_from_iter(vec![ + ("s", Arc::new(s) as ArrayRef), + ("i", Arc::new(i) as ArrayRef), + ("time", Arc::new(time) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![PhysicalSortExpr { + expr: col("time", &batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: true, + }, + }]; + + let results = dedupe(vec![batch], sort_keys).await; + + let expected = vec![ + "+-----+----+----------------------+", + "| s | i | time |", + "+-----+----+----------------------+", + "| cat | 10 | 2021-12-16T00:00:00Z |", + 
"+-----+----+----------------------+", + ]; + assert_batches_eq!(&expected, &results.output); + } + + #[tokio::test] + async fn test_multi_record_batch() { + // input: + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | b | 1 | 2 + // a | c | 3 | + // a | c | 4 | 5 + // ====(next batch)==== + // a | c | | 6 + // b | d | 7 | 8 + + // + // expected output: + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | b | 1 | 2 + // a | c | 4 | 6 + // b | d | 7 | 8 + + let t1 = StringArray::from(vec![Some("a"), Some("a"), Some("a")]); + + let t2 = StringArray::from(vec![Some("b"), Some("c"), Some("c")]); + + let f1 = Float64Array::from(vec![Some(1.0), Some(3.0), Some(4.0)]); + + let f2 = Float64Array::from(vec![Some(2.0), None, Some(5.0)]); + + let batch1 = RecordBatch::try_from_iter_with_nullable(vec![ + ("t1", Arc::new(t1) as ArrayRef, true), + ("t2", Arc::new(t2) as ArrayRef, true), + ("f1", Arc::new(f1) as ArrayRef, true), + ("f2", Arc::new(f2) as ArrayRef, true), + ]) + .unwrap(); + + let t1 = StringArray::from(vec![Some("a"), Some("b")]); + + let t2 = StringArray::from(vec![Some("c"), Some("d")]); + + let f1 = Float64Array::from(vec![None, Some(7.0)]); + + let f2 = Float64Array::from(vec![Some(6.0), Some(8.0)]); + + let batch2 = RecordBatch::try_from_iter_with_nullable(vec![ + ("t1", Arc::new(t1) as ArrayRef, true), + ("t2", Arc::new(t2) as ArrayRef, true), + ("f1", Arc::new(f1) as ArrayRef, true), + ("f2", Arc::new(f2) as ArrayRef, true), + ]) + .unwrap(); + + let sort_keys = vec![ + PhysicalSortExpr { + expr: col("t1", &batch2.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: col("t2", &batch2.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]; + + let results = dedupe(vec![batch1, batch2], sort_keys).await; + + let expected = vec![ + "+----+----+-----+-----+", + "| t1 | t2 | f1 | f2 |", + "+----+----+-----+-----+", + "| a | b | 1.0 | 
2.0 |", + "| a | c | 4.0 | 6.0 |", + "| b | d | 7.0 | 8.0 |", + "+----+----+-----+-----+", + ]; + assert_batches_eq!(&expected, &results.output); + // 5 rows in initial input, 3 rows in output ==> 2 dupes + assert_eq!(results.num_dupes(), 5 - 3); + } + + #[tokio::test] + async fn test_no_dupes() { + // special case test for data without duplicates (fast path) + // input: + // t1 | f1 + // ---+---- + // a | 1 + // ====(next batch)==== + // b | 2 + // + // expected output: + // + // t1 | f1 + // ---+---- + // a | 1 + // b | 2 + + let t1 = StringArray::from(vec![Some("a")]); + let f1 = Float64Array::from(vec![Some(1.0)]); + + let batch1 = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ]) + .unwrap(); + + let t1 = StringArray::from(vec![Some("b")]); + let f1 = Float64Array::from(vec![Some(2.0)]); + + let batch2 = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![PhysicalSortExpr { + expr: col("t1", &batch2.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }]; + + let results = dedupe(vec![batch1, batch2], sort_keys).await; + + let expected = vec![ + "+----+-----+", + "| t1 | f1 |", + "+----+-----+", + "| a | 1.0 |", + "| b | 2.0 |", + "+----+-----+", + ]; + assert_batches_eq!(&expected, &results.output); + + // also validate there were no dupes detected + assert_eq!(results.num_dupes(), 0); + } + + #[tokio::test] + async fn test_single_pk() { + // test boundary condition + + // input: + // t1 | f1 | f2 + // ---+----+---- + // a | 1 | 2 + // a | 3 | 4 + // + // expected output: + // + // t1 | f1 | f2 + // ---+----+---- + // a | 3 | 4 + + let t1 = StringArray::from(vec![Some("a"), Some("a")]); + let f1 = Float64Array::from(vec![Some(1.0), Some(3.0)]); + let f2 = Float64Array::from(vec![Some(2.0), Some(4.0)]); + + let batch = RecordBatch::try_from_iter(vec![ + ("t1", 
Arc::new(t1) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![PhysicalSortExpr { + expr: col("t1", &batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }]; + + let results = dedupe(vec![batch], sort_keys).await; + + let expected = vec![ + "+----+-----+-----+", + "| t1 | f1 | f2 |", + "+----+-----+-----+", + "| a | 3.0 | 4.0 |", + "+----+-----+-----+", + ]; + assert_batches_eq!(&expected, &results.output); + } + + #[tokio::test] + async fn test_column_reorder() { + // test if they fields come before tags and tags not in right order + + // input: + // f1 | t2 | t1 + // ---+----+---- + // 1 | a | a + // 2 | a | a + // 3 | a | b + // 4 | b | b + // + // expected output: + // + // f1 | t2 | t1 + // ---+----+---- + // 2 | a | a + // 3 | a | b + // 4 | b | b + + let f1 = Float64Array::from(vec![Some(1.0), Some(2.0), Some(3.0), Some(4.0)]); + let t2 = StringArray::from(vec![Some("a"), Some("a"), Some("a"), Some("b")]); + let t1 = StringArray::from(vec![Some("a"), Some("a"), Some("b"), Some("b")]); + + let batch = RecordBatch::try_from_iter(vec![ + ("f1", Arc::new(f1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("t1", Arc::new(t1) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![ + PhysicalSortExpr { + expr: col("t1", &batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: col("t2", &batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]; + + let results = dedupe(vec![batch], sort_keys).await; + + let expected = vec![ + "+-----+----+----+", + "| f1 | t2 | t1 |", + "+-----+----+----+", + "| 2.0 | a | a |", + "| 3.0 | a | b |", + "| 4.0 | b | b |", + "+-----+----+----+", + ]; + assert_batches_eq!(&expected, &results.output); + } + + #[tokio::test] + #[should_panic(expected = "This is the error")] + async 
fn test_input_error_propagated() { + // test that an error from the input gets to the output + + // input: + // t1 | f1 + // ---+---- + // a | 1 + // === next batch === + // (error) + + let t1 = StringArray::from(vec![Some("a")]); + let f1 = Float64Array::from(vec![Some(1.0)]); + + let batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ]) + .unwrap(); + + let schema = batch.schema(); + let batches = vec![ + Ok(batch), + Err(ArrowError::ComputeError("This is the error".to_string())), + ]; + + let input = Arc::new(DummyExec { + schema: Arc::clone(&schema), + batches, + }); + + let sort_keys = vec![PhysicalSortExpr { + expr: col("t1", &schema).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }]; + + let exec: Arc = Arc::new(DeduplicateExec::new(input, sort_keys, false)); + test_collect(exec).await; + } + + #[tokio::test] + async fn test_dictionary() { + let t1 = DictionaryArray::::from_iter(vec![Some("a"), Some("a"), Some("b")]); + let t2 = DictionaryArray::::from_iter(vec![Some("b"), Some("c"), Some("c")]); + let f1 = Float64Array::from(vec![Some(1.0), Some(3.0), Some(4.0)]); + let f2 = Float64Array::from(vec![Some(2.0), None, Some(5.0)]); + + let batch1 = RecordBatch::try_from_iter_with_nullable(vec![ + ("t1", Arc::new(t1) as ArrayRef, true), + ("t2", Arc::new(t2) as ArrayRef, true), + ("f1", Arc::new(f1) as ArrayRef, true), + ("f2", Arc::new(f2) as ArrayRef, true), + ]) + .unwrap(); + + let t1 = DictionaryArray::::from_iter(vec![Some("b"), Some("c")]); + let t2 = DictionaryArray::::from_iter(vec![Some("c"), Some("d")]); + let f1 = Float64Array::from(vec![None, Some(7.0)]); + let f2 = Float64Array::from(vec![Some(6.0), Some(8.0)]); + + let batch2 = RecordBatch::try_from_iter_with_nullable(vec![ + ("t1", Arc::new(t1) as ArrayRef, true), + ("t2", Arc::new(t2) as ArrayRef, true), + ("f1", Arc::new(f1) as ArrayRef, true), + ("f2", Arc::new(f2) as ArrayRef, true), + ]) + 
.unwrap(); + + let sort_keys = vec![ + PhysicalSortExpr { + expr: col("t1", &batch1.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: col("t2", &batch1.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]; + + let results = dedupe(vec![batch1, batch2], sort_keys).await; + + let cols: Vec<_> = results + .output + .iter() + .map(|batch| { + batch + .column(batch.schema().column_with_name("t1").unwrap().0) + .as_any() + .downcast_ref::>() + .unwrap() + }) + .collect(); + + // Should produce optimised dictionaries + // The batching is not important + assert_eq!(cols.len(), 3); + assert_eq!(cols[0].keys().len(), 2); + assert_eq!(cols[0].values().len(), 1); // "a" + assert_eq!(cols[1].keys().len(), 1); + assert_eq!(cols[1].values().len(), 1); // "b" + assert_eq!(cols[2].keys().len(), 1); + assert_eq!(cols[2].values().len(), 1); // "c" + + let expected = vec![ + "+----+----+-----+-----+", + "| t1 | t2 | f1 | f2 |", + "+----+----+-----+-----+", + "| a | b | 1.0 | 2.0 |", + "| a | c | 3.0 | |", + "| b | c | 4.0 | 6.0 |", + "| c | d | 7.0 | 8.0 |", + "+----+----+-----+-----+", + ]; + assert_batches_eq!(&expected, &results.output); + // 5 rows in initial input, 4 rows in output ==> 1 dupes + assert_eq!(results.num_dupes(), 5 - 4); + } + + struct TestResults { + output: Vec, + exec: Arc, + } + + impl TestResults { + /// return the number of duplicates this deduplicator detected + fn num_dupes(&self) -> usize { + let metrics = self.exec.metrics().unwrap(); + + let metrics = metrics + .iter() + .filter(|m| m.value().name() == "num_dupes") + .collect::>(); + + assert_eq!( + metrics.len(), + 1, + "expected only one duplicate metric, found {metrics:?}" + ); + metrics[0].value().as_usize() + } + } + + /// Run the input through the deduplicator and return results + async fn dedupe(input: Vec, sort_keys: Vec) -> TestResults { + 
test_helpers::maybe_start_logging(); + + // Setup in memory stream + let schema = input[0].schema(); + let projection = None; + let input = Arc::new(MemoryExec::try_new(&[input], schema, projection).unwrap()); + + // Create and run the deduplicator + let exec = Arc::new(DeduplicateExec::new(input, sort_keys, false)); + let output = test_collect(Arc::clone(&exec) as Arc).await; + + TestResults { output, exec } + } + + /// A PhysicalPlan that sends a specific set of + /// Result for testing. + #[derive(Debug)] + struct DummyExec { + schema: SchemaRef, + batches: Vec>, + } + + impl ExecutionPlan for DummyExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> Partitioning { + unimplemented!(); + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + unimplemented!() + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + unimplemented!() + } + + fn execute( + &self, + partition: usize, + _context: Arc, + ) -> Result { + assert_eq!(partition, 0); + + debug!(partition, "Start DummyExec::execute"); + + // queue them all up + let (tx, rx) = mpsc::unbounded_channel(); + + // queue up all the results + let batches: Vec<_> = self + .batches + .iter() + .map(|r| match r { + Ok(batch) => Ok(batch.clone()), + Err(e) => Err(DataFusionError::External(e.to_string().into())), + }) + .collect(); + let tx_captured = tx.clone(); + let fut = async move { + for r in batches { + tx_captured.send(r).unwrap(); + } + + Ok(()) + }; + let handle = WatchedTask::new(fut, vec![tx], "dummy send"); + + debug!(partition, "End DummyExec::execute"); + Ok(AdapterStream::adapt_unbounded(self.schema(), rx, handle)) + } + + fn statistics(&self) -> Result { + // don't know anything about the statistics + Ok(Statistics::new_unknown(&self.schema())) + } + } + + impl DisplayAs for DummyExec { + fn fmt_as(&self, _t: 
DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "DummyExec") + } + } +} diff --git a/iox_query/src/provider/deduplicate/algo.rs b/iox_query/src/provider/deduplicate/algo.rs new file mode 100644 index 0000000..a4c24e6 --- /dev/null +++ b/iox_query/src/provider/deduplicate/algo.rs @@ -0,0 +1,841 @@ +//! Implementation of Deduplication algorithm + +use std::{cmp::Ordering, ops::Range, sync::Arc}; + +use arrow::{ + array::{ArrayRef, UInt64Array}, + compute::TakeOptions, + datatypes::{DataType, TimeUnit}, + error::Result as ArrowResult, + record_batch::RecordBatch, +}; + +use arrow_util::optimize::optimize_dictionaries; +use datafusion::physical_plan::{ + coalesce_batches::concat_batches, expressions::PhysicalSortExpr, metrics, PhysicalExpr, +}; +use observability_deps::tracing::{debug, trace}; + +// Handles the deduplication across potentially multiple +// [`RecordBatch`]es which are already sorted on a primary key, +// including primary keys which straddle RecordBatch boundaries +#[derive(Debug)] +pub struct RecordBatchDeduplicator { + sort_keys: Vec, + last_batch: Option, + num_dupes: metrics::Count, +} + +#[derive(Debug)] +struct DuplicateRanges { + /// `is_sort_key[col_idx] = true` if the the input column at + /// `col_idx` is present in sort keys + is_sort_key: Vec, + + /// ranges of row indices where the sort key columns have the + /// same values + ranges: Vec>, +} + +impl RecordBatchDeduplicator { + pub fn new( + sort_keys: Vec, + num_dupes: metrics::Count, + last_batch: Option, + ) -> Self { + Self { + sort_keys, + last_batch, + num_dupes, + } + } + + /// Push a new RecordBatch into the indexer. 
Returns a + /// deduplicated RecordBatch and remembers any currently opened + /// groups + pub fn push(&mut self, batch: RecordBatch) -> ArrowResult { + // If we had a previous batch of rows, add it in here + // + // Potential optimization would be to check if the sort key is actually the same + // for the first row in the new batch and skip this concat if that is the case + let batch = if let Some(last_batch) = self.last_batch.take() { + let schema = last_batch.schema(); + let row_count = last_batch.num_rows() + batch.num_rows(); + debug!(row_count, "Before concat_batches"); + let result = concat_batches(&schema, &[last_batch, batch], row_count)?; + debug!(row_count, "After concat_batches"); + result + } else { + batch + }; + + let mut dupe_ranges = self.compute_ranges(&batch)?; + trace!("Finish computing range"); + + // The last partition may span batches so we can't emit it + // until we have seen the next batch (or we are at end of + // stream) + let last_range = dupe_ranges.ranges.pop(); + + let output_record_batch = self.output_from_ranges(&batch, &dupe_ranges)?; + trace!( + num_rows = output_record_batch.num_rows(), + "Rows of ouput_record_batch" + ); + + // Now, save the last bit of the pk + if let Some(last_range) = last_range { + let len = last_range.end - last_range.start; + let last_batch = Self::slice_record_batch(&batch, last_range.start, len)?; + self.last_batch = Some(last_batch); + } + trace!("done pushing record batch into the indexer"); + + Ok(output_record_batch) + } + + /// Return last_batch if it does not overlap with the given batch + /// Note that since last_batch, if exists, will include at least one row and all of its rows will have the same key + pub fn last_batch_with_no_same_sort_key(&mut self, batch: &RecordBatch) -> Option { + // Take the previous batch, if any, out of it storage self.last_batch + if let Some(last_batch) = self.last_batch.take() { + // Build sorted columns for last_batch and current one + let schema = 
last_batch.schema(); + // is_sort_key[col_idx] = true if it is present in sort keys + let mut is_sort_key: Vec = vec![false; last_batch.columns().len()]; + let last_batch_key_columns = self + .sort_keys + .iter() + .map(|skey| { + // figure out the index of the key columns + let name = get_col_name(skey.expr.as_ref()); + let index = schema.index_of(name).unwrap(); + is_sort_key[index] = true; + + // Key column of last_batch of this index + let last_batch_array = last_batch.column(index); + if last_batch_array.len() == 0 { + panic!("Key column, {name}, of last_batch has no data"); + } + arrow::compute::SortColumn { + values: Arc::clone(last_batch_array), + options: Some(skey.options), + } + }) + .collect::>(); + + // Build sorted columns for current batch + // Schema of both batches are the same + let batch_key_columns = self + .sort_keys + .iter() + .map(|skey| { + // figure out the index of the key columns + let name = get_col_name(skey.expr.as_ref()); + let index = schema.index_of(name).unwrap(); + + // Key column of current batch of this index + let array = batch.column(index); + if array.len() == 0 { + panic!("Key column, {name}, of current batch has no data"); + } + arrow::compute::SortColumn { + values: Arc::clone(array), + options: Some(skey.options), + } + }) + .collect::>(); + + // Zip the 2 key sets of columns for comparison + let zipped = last_batch_key_columns.iter().zip(batch_key_columns.iter()); + + // Compare sort keys of the first row of the given batch the the last_batch + // Note that the batches are sorted and all rows of last_batch have the same sort keys so + // only need to compare last row of the last_batch with the first row of the current batch + let mut same = true; + for (l, r) in zipped { + let last_idx = l.values.len() - 1; + if (l.values.is_valid(last_idx), r.values.is_valid(0)) == (true, true) { + // Both have values, do the actual comparison + let c = + arrow::array::build_compare(l.values.as_ref(), r.values.as_ref()).unwrap(); + + 
match c(last_idx, 0) { + Ordering::Equal => {} + _ => { + same = false; + break; + } + } + } else { + // At least one of the value is invalid, consider they are different + same = false; + break; + } + } + + if same { + // The batches overlap and need to be concatinated + // So, store it back in self.last_batch for the concat_batches later + self.last_batch = Some(last_batch); + None + } else { + // The batches do not overlap, deduplicate and then return the last_batch to get sent downstream + + // Ranges of the batch needed for deduplication + // This last batch include only one range with all same key + let ranges = vec![ + Range { + start: 0, + end: last_batch.num_rows() + }; + 1 + ]; + let dupe_ranges = DuplicateRanges { + is_sort_key, + ranges, + }; + let dedup_last_batch = self.output_from_ranges(&last_batch, &dupe_ranges).unwrap(); + + Some(dedup_last_batch) + } + } else { + None + } + } + + /// Consume the indexer, returning any remaining record batches for output + pub fn finish(mut self) -> ArrowResult> { + self.last_batch + .take() + .map(|last_batch| { + let dupe_ranges = self.compute_ranges(&last_batch)?; + self.output_from_ranges(&last_batch, &dupe_ranges) + }) + .transpose() + } + + /// Computes the ranges where the sort key has the same values + fn compute_ranges(&self, batch: &RecordBatch) -> ArrowResult { + let schema = batch.schema(); + // is_sort_key[col_idx] = true if it is present in sort keys + let mut is_sort_key: Vec = vec![false; batch.columns().len()]; + + // Figure out the columns used to optimize the way we compute the ranges. 
+ // Since in IOx's use cases, every ingesting row is almost unique, the optimal way + // to get the ranges is to compare row by row from the highest cardinality column + // to the lowest one + // + // First get key columns which are the sort key columns in lowest to + // highest cardinality plus time column at the end + let mut columns: Vec<_> = self + .sort_keys + .iter() + .map(|skey| { + // figure out what input column this is for + let name = get_col_name(skey.expr.as_ref()); + let index = schema.index_of(name).unwrap(); + + is_sort_key[index] = true; + + Arc::clone(batch.column(index)) + }) + .collect(); + // + // Then converting the columns order from: lowest cardinality, second lowest, ..., highest cardinality, time + // to: highest cardinality, time, second highest cardinality, ...., lowest cardinality + // + // If the last column is time, swap time with its previous column (if any) which is + // the column with the highest cardinality + let len = columns.len(); + if len > 1 { + if let DataType::Timestamp(TimeUnit::Nanosecond, _) = columns[len - 1].data_type() { + columns.swap(len - 2, len - 1); + } + } + // Reverse the list + columns.reverse(); + + // Compute partitions (aka breakpoints between the ranges) + // Each range (or partition) includes a unique sort key value which is + // a unique combination of PK columns. PK columns consist of all tags and the time col. 
+ let partitions = arrow::compute::partition(&columns)?; + let ranges = partitions.ranges(); + + Ok(DuplicateRanges { + is_sort_key, + ranges, + }) + } + + /// Compute the output record batch that includes the specified ranges + fn output_from_ranges( + &self, + batch: &RecordBatch, + dupe_ranges: &DuplicateRanges, + ) -> ArrowResult { + let ranges = &dupe_ranges.ranges; + + // each range is at least 1 large, so any that have more than + // 1 are duplicates + let num_dupes = ranges.iter().map(|r| r.end - r.start - 1).sum(); + + self.num_dupes.add(num_dupes); + + // Special case when no ranges are duplicated (so just emit input as output) + if num_dupes == 0 { + trace!(num_rows = batch.num_rows(), "No dupes"); + Self::slice_record_batch(batch, 0, ranges.len()) + } else { + trace!(num_dupes, num_rows = batch.num_rows(), "dupes"); + + // Use take kernel + let sort_key_indices = self.compute_sort_key_indices(ranges); + + let take_options = Some(TakeOptions { + check_bounds: false, + }); + + // Form each new column by `take`ing the indices as needed + let new_columns = batch + .columns() + .iter() + .enumerate() + .map(|(input_index, input_array)| { + if dupe_ranges.is_sort_key[input_index] { + arrow::compute::take( + input_array.as_ref(), + &sort_key_indices, + take_options.clone(), + ) + } else { + // pick the last non null value + let field_indices = self.compute_field_indices(ranges, input_array); + + arrow::compute::take( + input_array.as_ref(), + &field_indices, + take_options.clone(), + ) + } + }) + .collect::>>()?; + + let batch = RecordBatch::try_new(batch.schema(), new_columns)?; + // At time of writing, `MutableArrayData` concatenates the + // contents of dictionaries as well; Do a post pass to remove the + // redundancy if possible + optimize_dictionaries(&batch) + } + } + + /// Returns an array of indices, one for each input range (which + /// index is arbitrary as all the values are the same for the sort + /// column in each pk group) + /// + /// ranges: 
0-1, 2-4, 5-6 --> Array[0, 2, 5] + fn compute_sort_key_indices(&self, ranges: &[Range]) -> UInt64Array { + ranges.iter().map(|r| Some(r.start as u64)).collect() + } + + /// Returns an array of indices, one for each input range that + /// return the first non-null value of `input_array` in that range + /// (aka it will pick the index of the field value to use for each + /// pk group) + /// + /// ranges: 0-1, 2-4, 5-6 + /// input array: A, NULL, NULL, C, NULL, NULL + /// --> Array[0, 3, 5] + fn compute_field_indices( + &self, + ranges: &[Range], + input_array: &ArrayRef, + ) -> UInt64Array { + ranges + .iter() + .map(|r| { + let value_index = r + .clone() + .filter(|&i| input_array.is_valid(i)) + .last() + .map(|i| i as u64) + // if all field values are none, pick one arbitrarily + .unwrap_or(r.start as u64); + Some(value_index) + }) + .collect() + } + + /// Create a new record batch from offset --> len + fn slice_record_batch( + batch: &RecordBatch, + offset: usize, + len: usize, + ) -> ArrowResult { + let batch = batch.slice(offset, len); + + // At time of writing, `concat_batches` concatenates the + // contents of dictionaries as well; Do a post pass to remove the + // redundancy if possible + optimize_dictionaries(&batch) + } +} + +/// Get column name out of the `expr`. TODO use +/// schema::SortKey instead. 
+pub(crate) fn get_col_name(expr: &dyn PhysicalExpr) -> &str { + expr.as_any() + .downcast_ref::() + .expect("expected column reference") + .name() +} + +#[cfg(test)] +mod test { + use arrow::array::{Int64Array, TimestampNanosecondArray}; + use arrow::compute::SortOptions; + use arrow::{ + array::{ArrayRef, Float64Array, StringArray}, + record_batch::RecordBatch, + }; + + use arrow_util::assert_batches_eq; + use datafusion::physical_plan::expressions::col; + use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; + + use super::*; + + #[tokio::test] + async fn test_non_overlapped_sorted_batches_one_key_column() { + // Sorted key: t1 + + // Last batch + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | b | 1 | 2 + // a | c | 3 | + // a | c | 4 | + + // Current batch + // ====(next batch)==== + // b | c | | 6 + // b | d | 7 | 8 + + // Non overlapped => return last batch + // Expected output = Deduplication of Last batch + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | c | 4 | 2 + + // Columns of last_batch + let t1 = StringArray::from(vec![Some("a"), Some("a"), Some("a")]); + let t2 = StringArray::from(vec![Some("b"), Some("c"), Some("c")]); + let f1 = Float64Array::from(vec![Some(1.0), Some(3.0), Some(4.0)]); + let f2 = Float64Array::from(vec![Some(2.0), None, None]); + + let last_batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + // Columns of current_batch + let t1 = StringArray::from(vec![Some("b"), Some("b")]); + let t2 = StringArray::from(vec![Some("c"), Some("d")]); + let f1 = Float64Array::from(vec![None, Some(7.0)]); + let f2 = Float64Array::from(vec![Some(6.0), Some(8.0)]); + + let current_batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as 
ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![PhysicalSortExpr { + expr: col("t1", ¤t_batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }]; + + let mut dedupe = RecordBatchDeduplicator::new(sort_keys, make_counter(), Some(last_batch)); + + let results = dedupe + .last_batch_with_no_same_sort_key(¤t_batch) + .unwrap(); + + let expected = vec![ + "+----+----+-----+-----+", + "| t1 | t2 | f1 | f2 |", + "+----+----+-----+-----+", + "| a | c | 4.0 | 2.0 |", + "+----+----+-----+-----+", + ]; + assert_batches_eq!(&expected, &[results]); + } + + #[tokio::test] + async fn test_non_overlapped_sorted_batches_two_key_columns() { + // Sorted key: t1, t2 + + // Last batch + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | c | 1 | 2 + // a | c | 3 | + // a | c | 4 | 5 + + // Current batch + // ====(next batch)==== + // b | c | | 6 + // b | d | 7 | 8 + + // Non overlapped => return last batch + // Expected output = Deduplication of last batch + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | c | 4 | 5 + + // Columns of last_batch + let t1 = StringArray::from(vec![Some("a"), Some("a"), Some("a")]); + let t2 = StringArray::from(vec![Some("c"), Some("c"), Some("c")]); + let f1 = Float64Array::from(vec![Some(1.0), Some(3.0), Some(4.0)]); + let f2 = Float64Array::from(vec![Some(2.0), None, Some(5.0)]); + + let last_batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + // Columns of current_batch + let t1 = StringArray::from(vec![Some("b"), Some("b")]); + let t2 = StringArray::from(vec![Some("c"), Some("d")]); + let f1 = Float64Array::from(vec![None, Some(7.0)]); + let f2 = Float64Array::from(vec![Some(6.0), Some(8.0)]); + + let current_batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", 
Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![ + PhysicalSortExpr { + expr: col("t1", ¤t_batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: col("t2", ¤t_batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]; + + let mut dedupe = RecordBatchDeduplicator::new(sort_keys, make_counter(), Some(last_batch)); + + let results = dedupe + .last_batch_with_no_same_sort_key(¤t_batch) + .unwrap(); + + let expected = vec![ + "+----+----+-----+-----+", + "| t1 | t2 | f1 | f2 |", + "+----+----+-----+-----+", + "| a | c | 4.0 | 5.0 |", + "+----+----+-----+-----+", + ]; + assert_batches_eq!(&expected, &[results]); + } + + #[tokio::test] + async fn test_overlapped_sorted_batches_one_key_column() { + // Sorted key: t1 + + // Last batch + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | b | 1 | 2 + // a | b | 3 | + + // Current batch + // ====(next batch)==== + // a | b | | 6 + // b | d | 7 | 8 + + // Overlapped => return None + + // Columns of last_batch + let t1 = StringArray::from(vec![Some("a"), Some("a")]); + let t2 = StringArray::from(vec![Some("b"), Some("b")]); + let f1 = Float64Array::from(vec![Some(1.0), Some(3.0)]); + let f2 = Float64Array::from(vec![Some(2.0), None]); + + let last_batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + // Columns of current_batch + let t1 = StringArray::from(vec![Some("a"), Some("b")]); + let t2 = StringArray::from(vec![Some("b"), Some("d")]); + let f1 = Float64Array::from(vec![None, Some(7.0)]); + let f2 = Float64Array::from(vec![Some(6.0), Some(8.0)]); + + let current_batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", 
Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![PhysicalSortExpr { + expr: col("t1", ¤t_batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }]; + + let mut dedupe = RecordBatchDeduplicator::new(sort_keys, make_counter(), Some(last_batch)); + + let results = dedupe.last_batch_with_no_same_sort_key(¤t_batch); + assert!(results.is_none()); + } + + #[tokio::test] + async fn test_overlapped_sorted_batches_two_key_columns() { + // Sorted key: t1, t2 + + // Last batch + // t1 | t2 | f1 | f2 + // ---+----+----+---- + // a | b | 1 | 2 + // a | b | 3 | + + // Current batch + // ====(next batch)==== + // a | b | | 6 + // b | d | 7 | 8 + + // Overlapped => return None + + // Columns of last_batch + let t1 = StringArray::from(vec![Some("a"), Some("a")]); + let t2 = StringArray::from(vec![Some("b"), Some("b")]); + let f1 = Float64Array::from(vec![Some(1.0), Some(3.0)]); + let f2 = Float64Array::from(vec![Some(2.0), None]); + + let last_batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + // Columns of current_batch + let t1 = StringArray::from(vec![Some("a"), Some("b")]); + let t2 = StringArray::from(vec![Some("b"), Some("d")]); + let f1 = Float64Array::from(vec![None, Some(7.0)]); + let f2 = Float64Array::from(vec![Some(6.0), Some(8.0)]); + + let current_batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![ + PhysicalSortExpr { + expr: col("t1", ¤t_batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: col("t2", ¤t_batch.schema()).unwrap(), + options: SortOptions { + 
descending: false, + nulls_first: false, + }, + }, + ]; + + let mut dedupe = RecordBatchDeduplicator::new(sort_keys, make_counter(), Some(last_batch)); + + let results = dedupe.last_batch_with_no_same_sort_key(¤t_batch); + assert!(results.is_none()); + } + + #[tokio::test] + async fn test_non_overlapped_none_last_batch() { + // Sorted key: t1, t2 + + // Current batch + // ====(next batch)==== + // a | b | | 6 + // b | d | 7 | 8 + + // Columns of current_batch + let t1 = StringArray::from(vec![Some("a"), Some("b")]); + let t2 = StringArray::from(vec![Some("b"), Some("d")]); + let f1 = Float64Array::from(vec![None, Some(7.0)]); + let f2 = Float64Array::from(vec![Some(6.0), Some(8.0)]); + + let current_batch = RecordBatch::try_from_iter(vec![ + ("t1", Arc::new(t1) as ArrayRef), + ("t2", Arc::new(t2) as ArrayRef), + ("f1", Arc::new(f1) as ArrayRef), + ("f2", Arc::new(f2) as ArrayRef), + ]) + .unwrap(); + + let sort_keys = vec![ + PhysicalSortExpr { + expr: col("t1", ¤t_batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: col("t2", ¤t_batch.schema()).unwrap(), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]; + + let mut dedupe = RecordBatchDeduplicator::new(sort_keys, make_counter(), None); + + let results = dedupe.last_batch_with_no_same_sort_key(¤t_batch); + assert!(results.is_none()); + } + + #[tokio::test] + async fn test_compute_ranges() { + // Input columns: + // The input columns are sorted on this sort order: + // (Lowest_Cardinality, Second_Highest_Cardinality, Highest_Cardinality, Time) + // + // Invisible Index | Lowest_Cardinality | Second_Highest_Cardinality | Highest_Cardinality | Time + // (not a real col) + // --------------- | -------------------- | --------------------------- | ------------------- | ---- + // 0 | 1 | 1 | 1 | 1 + // 1 | 1 | 1 | 1 | 10 + // 2 | 1 | 1 | 3 | 8 + // 3 | 1 | 1 | 4 | 9 + // 4 | 1 | 1 | 4 | 9 + // 5 | 1 | 1 | 5 | 1 
+ // 6 | 1 | 1 | 5 | 15 + // 7 | 1 | 2 | 5 | 15 + // 8 | 1 | 2 | 5 | 15 + // 9 | 2 | 2 | 5 | 15 + // Out put ranges: 8 ranges on their invisible indices + // [0, 1], + // [1, 2], + // [2, 3], + // [3, 5], -- 2 rows with same values (1, 1, 4, 9) + // [5, 6], + // [6, 7], + // [7, 9], -- 2 rows with same values (1, 2, 5, 15) + // [9, 10], + + let mut lowest_cardinality = vec![Some("1"); 9]; // 9 first values are all Some(1) + lowest_cardinality.push(Some("2")); // Add Some(2) + let lowest_cardinality = Arc::new(StringArray::from(lowest_cardinality)) as ArrayRef; + + let mut second_highest_cardinality = vec![Some(1.0); 7]; + second_highest_cardinality.append(&mut vec![Some(2.0); 3]); + let second_highest_cardinality = + Arc::new(Float64Array::from(second_highest_cardinality)) as ArrayRef; + + let mut highest_cardinality = vec![Some(1), Some(1), Some(3), Some(4), Some(4)]; + highest_cardinality.append(&mut vec![Some(5); 5]); + let highest_cardinality = Arc::new(Int64Array::from(highest_cardinality)) as ArrayRef; + + let mut time = vec![Some(1), Some(10), Some(8), Some(9), Some(9), Some(1)]; + time.append(&mut vec![Some(15); 4]); + let time = Arc::new(TimestampNanosecondArray::from(time)) as ArrayRef; + + let batch = RecordBatch::try_from_iter(vec![ + ("lowest_cardinality", lowest_cardinality), + ("second_highest_cardinality", second_highest_cardinality), + ("highest_cardinality", highest_cardinality), + ("time", time), + ]) + .unwrap(); + + let options = SortOptions { + descending: false, + nulls_first: false, + }; + + let sort_keys = vec![ + PhysicalSortExpr { + expr: col("lowest_cardinality", &batch.schema()).unwrap(), + options, + }, + PhysicalSortExpr { + expr: col("second_highest_cardinality", &batch.schema()).unwrap(), + options, + }, + PhysicalSortExpr { + expr: col("highest_cardinality", &batch.schema()).unwrap(), + options, + }, + PhysicalSortExpr { + expr: col("time", &batch.schema()).unwrap(), + options, + }, + ]; + + let dedupe = 
RecordBatchDeduplicator::new(sort_keys, make_counter(), None); + let key_ranges = dedupe.compute_ranges(&batch).unwrap().ranges; + + let expected_key_range = vec![ + range(0, 1), + range(1, 2), + range(2, 3), + range(3, 5), + range(5, 6), + range(6, 7), + range(7, 9), + range(9, 10), + ]; + + assert_eq!(key_ranges, expected_key_range); + } + + fn make_counter() -> metrics::Count { + let metrics = ExecutionPlanMetricsSet::new(); + MetricBuilder::new(&metrics).counter("num_dupes", 0) + } + + fn range(start: usize, end: usize) -> Range { + Range { start, end } + } +} diff --git a/iox_query/src/provider/deduplicate/key_ranges.rs b/iox_query/src/provider/deduplicate/key_ranges.rs new file mode 100644 index 0000000..429e06c --- /dev/null +++ b/iox_query/src/provider/deduplicate/key_ranges.rs @@ -0,0 +1,281 @@ +//! Implement iterator and comparator to split data into distinct ranges + +use arrow::array::{build_compare, DynComparator}; +use arrow::buffer::NullBuffer; +use arrow::compute::{SortColumn, SortOptions}; +use arrow::error::{ArrowError, Result as ArrowResult}; + +// use snafu::Snafu; +use std::cmp::Ordering; +use std::iter::Iterator; +use std::ops::Range; + +/// Given a list of key columns, find partition ranges that would partition +/// equal values across columns +/// +/// The returned vec would be of size k where k is cardinality of the values; Consecutive +/// values will be connected: (a, b) and (b, c), where start = 0 and end = n for the first and last +/// range. +/// +/// The algorithm works with any set of data (no sort needed) and columns but it is implemented to optimize the use case in which: +/// 1. Every row is almost unique +/// 2. 
Order of input columns is from highest to lowest cardinality +/// +/// Example Input columns: +/// Invisible Index | Highest_Cardinality | Time | Second_Highest_Cardinality | Lowest_Cardinality +/// --------------- | -------------------- | ---- | -------------------------- | -------------------- +/// 0 | 1 | 1 | 1 | 1 +/// 1 | 1 | 10 | 1 | 1 +/// 2 | 3 | 8 | 1 | 1 +/// 3 | 4 | 9 | 1 | 1 +/// 4 | 4 | 9 | 1 | 1 +/// 5 | 5 | 1 | 1 | 1 +/// 6 | 5 | 15 | 1 | 1 +/// 7 | 5 | 15 | 2 | 1 +/// 8 | 5 | 15 | 2 | 1 +/// 9 | 5 | 15 | 2 | 2 +/// The columns are sorted (and RLE) on this different sort order: +/// (Lowest_Cardinality, Second_Highest_Cardinality, Highest_Cardinality, Time) +/// Out put ranges: 8 ranges on their invisible indices +/// [0, 1], +/// [1, 2], +/// [2, 3], +/// [3, 5], -- 2 rows with same values (4, 9, 1, 1) +/// [5, 6], +/// [6, 7], +/// [7, 9], -- 2 rows with same values (5, 15, 2, 1) +/// [9, 10] + +pub fn key_ranges(columns: &[SortColumn]) -> ArrowResult> + '_> { + KeyRangeIterator::try_new(columns) +} + +struct KeyRangeIterator<'a> { + // function to compare values of columns + comparator: KeyRangeComparator<'a>, + // Number of rows of the columns + num_rows: usize, + // end index of previous range which will be used as starting index of the next computing range + start_range_idx: usize, +} + +impl<'a> KeyRangeIterator<'a> { + fn try_new(columns: &'a [SortColumn]) -> ArrowResult { + if columns.is_empty() { + return Err(ArrowError::InvalidArgumentError( + "Key range requires at least one column".to_string(), + )); + } + let num_rows = columns[0].values.len(); + if columns.iter().any(|item| item.values.len() != num_rows) { + return Err(ArrowError::ComputeError( + "Sort columns have different row counts".to_string(), + )); + }; + + //let comparator = KeyRangeComparator::try_new(columns)?; + Ok(Self { + comparator: KeyRangeComparator::try_new(columns)?, + num_rows, + start_range_idx: 0, + }) + } +} + +impl<'a> Iterator for KeyRangeIterator<'a> { + type 
Item = Range; + + fn next(&mut self) -> Option { + // End of the row + if self.start_range_idx >= self.num_rows { + return None; + } + + let mut idx = self.start_range_idx + 1; + while idx < self.num_rows { + if self.comparator.compare(self.start_range_idx, idx) == Ordering::Equal { + idx += 1; + } else { + break; + } + } + let start = self.start_range_idx; + self.start_range_idx = idx; + Some(Range { start, end: idx }) + } +} + +type KeyRangeCompareItem<'a> = ( + Option<&'a NullBuffer>, // validity of array + DynComparator, // comparator + SortOptions, // sort_option +); + +// Todo: this is the same as LexicographicalComparator. +// Either use it or make it like https://github.com/apache/arrow-rs/issues/563 +/// A comparator that wraps given array data (columns) and can compare data +/// at given two indices. The lifetime is the same at the data wrapped. +pub(super) struct KeyRangeComparator<'a> { + compare_items: Vec>, +} + +fn is_valid(nulls: &Option<&NullBuffer>, idx: usize) -> bool { + nulls + .map(|nulls| nulls.is_valid(idx)) + // if there is no null buffer, the entry is valid + .unwrap_or(true) +} + +impl KeyRangeComparator<'_> { + /// compare values at the wrapped columns with given indices. 
+ pub(super) fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering { + for (nulls, comparator, sort_option) in &self.compare_items { + match (is_valid(nulls, a_idx), is_valid(nulls, b_idx)) { + (true, true) => { + match (comparator)(a_idx, b_idx) { + // equal, move on to next column + Ordering::Equal => continue, + order => { + if sort_option.descending { + return order.reverse(); + } else { + return order; + } + } + } + } + (false, true) => { + return if sort_option.nulls_first { + Ordering::Less + } else { + Ordering::Greater + }; + } + (true, false) => { + return if sort_option.nulls_first { + Ordering::Greater + } else { + Ordering::Less + }; + } + // equal, move on to next column + (false, false) => continue, + } + } + + Ordering::Equal + } + + /// Create a new comparator that will wrap the given columns and give comparison + /// results with two indices. + pub(super) fn try_new(columns: &[SortColumn]) -> ArrowResult> { + let compare_items = columns + .iter() + .map(|column| { + // flatten and convert build comparators + // use Nulls for is_valid checks later to avoid dynamic call + let values = column.values.as_ref(); + + let nulls = values.nulls(); + Ok(( + nulls, + build_compare(values, values)?, + column.options.unwrap_or_default(), + )) + }) + .collect::>>()?; + Ok(KeyRangeComparator { compare_items }) + } +} + +#[cfg(test)] +pub fn range(start: usize, end: usize) -> Range { + Range { start, end } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow::array::ArrayRef; + use arrow::array::{Int64Array, TimestampNanosecondArray}; + + use super::*; + + #[tokio::test] + async fn test_key_ranges() { + // Input columns: + // Invisible Index | Highest_Cardinality | Time | Second_Highest_Cardinality | Lowest_Cardinality + // (not a real col) + // --------------- | -------------------- | ---- | -------------------------- | -------------------- + // 0 | 1 | 1 | 1 | 1 + // 1 | 1 | 10 | 1 | 1 + // 2 | 3 | 8 | 1 | 1 + // 3 | 4 | 9 | 1 | 1 + // 4 | 4 | 
9 | 1 | 1 + // 5 | 5 | 1 | 1 | 1 + // 6 | 5 | 15 | 1 | 1 + // 7 | 5 | 15 | 2 | 1 + // 8 | 5 | 15 | 2 | 1 + // 9 | 5 | 15 | 2 | 2 + // The columns are sorted on this sort order: + // (Lowest_Cardinality, Second_Highest_Cardinality, Highest_Cardinality, Time) + // But when the key_ranges function is invoked, the input columns will be + // (Highest_Cardinality, Time, Second_Highest_Cardinality, Lowest_Cardinality) + // Out put ranges: 8 ranges on their invisible indices + // [0, 1], + // [1, 2], + // [2, 3], + // [3, 5], -- 2 rows with same values (4, 9, 1, 1) + // [5, 6], + // [6, 7], + // [7, 9], -- 2 rows with same values (5, 15, 2, 1) + // [9, 10], + + let mut lowest_cardinality = vec![Some(1); 9]; // 9 first values are all Some(1) + lowest_cardinality.push(Some(2)); // Add Some(2) + + let mut second_highest_cardinality = vec![Some(1); 7]; + second_highest_cardinality.append(&mut vec![Some(2); 3]); + + let mut time = vec![Some(1), Some(10), Some(8), Some(9), Some(9), Some(1)]; + time.append(&mut vec![Some(15); 4]); + + let mut highest_cardinality = vec![Some(1), Some(1), Some(3), Some(4), Some(4)]; + highest_cardinality.append(&mut vec![Some(5); 5]); + + let input = vec![ + SortColumn { + values: Arc::new(Int64Array::from(highest_cardinality)) as ArrayRef, + options: None, + }, + SortColumn { + values: Arc::new(TimestampNanosecondArray::from(time)) as ArrayRef, + options: None, + }, + SortColumn { + values: Arc::new(Int64Array::from(second_highest_cardinality)) as ArrayRef, + options: None, + }, + SortColumn { + values: Arc::new(Int64Array::from(lowest_cardinality)) as ArrayRef, + options: None, + }, + ]; + + let key_ranges = key_ranges(&input).unwrap(); + + let expected_key_range = vec![ + range(0, 1), + range(1, 2), + range(2, 3), + range(3, 5), + range(5, 6), + range(6, 7), + range(7, 9), + range(9, 10), + ]; + + assert_eq!(key_ranges.collect::>(), expected_key_range); + } +} diff --git a/iox_query/src/provider/overlap.rs b/iox_query/src/provider/overlap.rs new 
file mode 100644 index 0000000..4b90162 --- /dev/null +++ b/iox_query/src/provider/overlap.rs @@ -0,0 +1,392 @@ +//! Contains the algorithm to determine which chunks may contain "duplicate" primary keys (that is +//! where data with the same combination of "tag" columns and timestamp in the InfluxDB DataModel +//! have been written in via multiple distinct line protocol writes (and thus are stored in +//! separate rows) + +use crate::QueryChunk; +use data_types::TimestampMinMax; +use datafusion::scalar::ScalarValue; +use observability_deps::tracing::debug; +use schema::TIME_COLUMN_NAME; +use std::sync::Arc; + +/// Groups query chunks into disjoint sets of overlapped time range. +/// Does not preserve or guarantee any ordering. +pub fn group_potential_duplicates( + chunks: Vec>, +) -> Vec>> { + let ts: Vec<_> = chunks + .iter() + .map(|c| timestamp_min_max(c.as_ref())) + .collect(); + + // If at least one of the chunks has no time range, + // all chunks are considered to overlap with each other. 
+ if ts.iter().any(|ts| ts.is_none()) { + debug!("At least one chunk has not timestamp min max"); + return vec![chunks]; + } + + // Use this algorithm to group them + // https://towardsdatascience.com/overlapping-time-period-problem-b7f1719347db + + let num_chunks = chunks.len(); + let mut grouper = Vec::with_capacity(num_chunks * 2); + + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] + enum StartEnd { + Start, + End, + } + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] + struct StartEndChunk { + start_end: StartEnd, + chunk: Option, + } + struct GrouperRecord { + value: V, + start_end_chunk: StartEndChunk, + } + + for (chunk, ts) in chunks.into_iter().zip(ts) { + let time_range = ts.expect("Time range should have value"); + + grouper.push(GrouperRecord { + value: time_range.min, + start_end_chunk: StartEndChunk { + start_end: StartEnd::Start, + chunk: None, + }, + }); + grouper.push(GrouperRecord { + value: time_range.max, + start_end_chunk: StartEndChunk { + start_end: StartEnd::End, + chunk: Some(chunk), + }, + }); + } + + grouper.sort_by_key(|gr| (gr.value, gr.start_end_chunk.start_end)); + + let mut cumulative_sum = 0; + let mut groups = Vec::with_capacity(num_chunks); + + for gr in grouper { + cumulative_sum += match gr.start_end_chunk.start_end { + StartEnd::Start => 1, + StartEnd::End => -1, + }; + + if matches!(gr.start_end_chunk.start_end, StartEnd::Start) && cumulative_sum == 1 { + groups.push(Vec::with_capacity(num_chunks)); + } + if let StartEnd::End = gr.start_end_chunk.start_end { + groups + .last_mut() + .expect("a start should have pushed at least one empty group") + .push(gr.start_end_chunk.chunk.expect("Must have chunk value")); + } + } + groups +} + +fn timestamp_min_max(chunk: &dyn QueryChunk) -> Option { + let stats = chunk.stats(); + chunk + .schema() + .find_index_of(TIME_COLUMN_NAME) + .map(|idx| &stats.column_statistics[idx]) + .and_then(|stats| { + if let ( + Some(ScalarValue::TimestampNanosecond(Some(min), _)), + 
Some(ScalarValue::TimestampNanosecond(Some(max), _)), + ) = (stats.min_value.get_value(), stats.max_value.get_value()) + { + Some(TimestampMinMax::new(*min, *max)) + } else { + None + } + }) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{test::TestChunk, QueryChunk}; + + #[macro_export] + macro_rules! assert_groups_eq { + ($EXPECTED_LINES: expr, $GROUPS: expr) => { + let expected_lines: Vec = + $EXPECTED_LINES.into_iter().map(|s| s.to_string()).collect(); + + let actual_lines = to_string($GROUPS); + + assert_eq!( + expected_lines, actual_lines, + "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", + expected_lines, actual_lines + ); + }; + } + + // Test cases: + + #[test] + fn one_time_column_overlap_same_min_max() { + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_time_column() + .with_timestamp_min_max(1, 1), + ); + let c2 = Arc::new( + TestChunk::new("chunk2") + .with_time_column() + .with_timestamp_min_max(1, 1), + ); + + let groups = group_potential_duplicates(vec![c1, c2]); + + let expected = vec!["Group 0: [chunk1, chunk2]"]; + assert_groups_eq!(expected, groups); + } + + #[test] + fn one_time_column_overlap_bad_case() { + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_time_column() + .with_timestamp_min_max(1, 10), + ); + let c2 = Arc::new( + TestChunk::new("chunk2") + .with_time_column() + .with_timestamp_min_max(15, 30), + ); + let c3 = Arc::new( + TestChunk::new("chunk3") + .with_time_column() + .with_timestamp_min_max(7, 20), + ); + let c4 = Arc::new( + TestChunk::new("chunk4") + .with_time_column() + .with_timestamp_min_max(25, 35), + ); + + let groups = group_potential_duplicates(vec![c1, c2, c3, c4]); + + let expected = vec!["Group 0: [chunk1, chunk3, chunk2, chunk4]"]; + assert_groups_eq!(expected, groups); + } + + #[test] + fn one_time_column_overlap_contiguous() { + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_time_column() + .with_timestamp_min_max(1, 10), + ); + let c2 = Arc::new( + TestChunk::new("chunk2") + 
.with_time_column() + .with_timestamp_min_max(7, 20), + ); + let c3 = Arc::new( + TestChunk::new("chunk3") + .with_time_column() + .with_timestamp_min_max(15, 30), + ); + let c4 = Arc::new( + TestChunk::new("chunk4") + .with_time_column() + .with_timestamp_min_max(25, 35), + ); + + let groups = group_potential_duplicates(vec![c1, c2, c3, c4]); + + let expected = vec!["Group 0: [chunk1, chunk2, chunk3, chunk4]"]; + assert_groups_eq!(expected, groups); + } + + #[test] + fn one_time_column_overlap_2_groups() { + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_time_column() + .with_timestamp_min_max(1, 10), + ); + let c2 = Arc::new( + TestChunk::new("chunk2") + .with_time_column() + .with_timestamp_min_max(7, 20), + ); + let c3 = Arc::new( + TestChunk::new("chunk3") + .with_time_column() + .with_timestamp_min_max(21, 30), + ); + let c4 = Arc::new( + TestChunk::new("chunk4") + .with_time_column() + .with_timestamp_min_max(25, 35), + ); + + let groups = group_potential_duplicates(vec![c1, c2, c3, c4]); + + let expected = vec!["Group 0: [chunk1, chunk2]", "Group 1: [chunk3, chunk4]"]; + assert_groups_eq!(expected, groups); + } + + #[test] + fn one_time_column_overlap_3_groups() { + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_time_column() + .with_timestamp_min_max(1, 10), + ); + let c2 = Arc::new( + TestChunk::new("chunk2") + .with_time_column() + .with_timestamp_min_max(7, 20), + ); + let c3 = Arc::new( + TestChunk::new("chunk3") + .with_time_column() + .with_timestamp_min_max(21, 24), + ); + let c4 = Arc::new( + TestChunk::new("chunk4") + .with_time_column() + .with_timestamp_min_max(25, 35), + ); + + let groups = group_potential_duplicates(vec![c1, c4, c3, c2]); + + let expected = vec![ + "Group 0: [chunk1, chunk2]", + "Group 1: [chunk3]", + "Group 2: [chunk4]", + ]; + assert_groups_eq!(expected, groups); + } + + #[test] + fn one_time_column_overlap_1_chunk() { + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_time_column() + 
.with_timestamp_min_max(1, 10), + ); + + let groups = group_potential_duplicates(vec![c1]); + + let expected = vec!["Group 0: [chunk1]"]; + assert_groups_eq!(expected, groups); + } + + #[test] + fn overlap_no_groups() { + let groups = group_potential_duplicates(vec![]); + + assert!(groups.is_empty()); + } + + #[test] + fn multi_columns_overlap_bad_case() { + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_time_column() + .with_timestamp_min_max(1, 10), + ); + let c2 = Arc::new( + TestChunk::new("chunk2") + .with_time_column() + .with_timestamp_min_max(15, 30) + .with_i64_field_column("field1"), + ); + let c3 = Arc::new( + TestChunk::new("chunk3") + .with_time_column() + .with_timestamp_min_max(7, 20) + .with_tag_column("tag1"), + ); + let c4 = Arc::new( + TestChunk::new("chunk4") + .with_time_column() + .with_timestamp_min_max(25, 35), + ); + + let groups = group_potential_duplicates(vec![c1, c2, c3, c4]); + + let expected = vec!["Group 0: [chunk1, chunk3, chunk2, chunk4]"]; + assert_groups_eq!(expected, groups); + } + + #[test] + fn multi_columns_overlap_1_chunk() { + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_time_column() + .with_timestamp_min_max(1, 10) + .with_tag_column("tag1"), + ); + + let groups = group_potential_duplicates(vec![c1]); + + let expected = vec!["Group 0: [chunk1]"]; + assert_groups_eq!(expected, groups); + } + + #[test] + fn multi_columns_overlap_3_groups() { + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_time_column() + .with_timestamp_min_max(1, 10) + .with_tag_column("tag1"), + ); + let c2 = Arc::new( + TestChunk::new("chunk2") + .with_time_column() + .with_timestamp_min_max(7, 20), + ); + let c3 = Arc::new( + TestChunk::new("chunk3") + .with_time_column() + .with_timestamp_min_max(21, 24) + .with_tag_column("tag2"), + ); + let c4 = Arc::new( + TestChunk::new("chunk4") + .with_time_column() + .with_timestamp_min_max(25, 35), + ); + + let groups = group_potential_duplicates(vec![c1, c4, c3, c2]); + + let expected = 
vec![ + "Group 0: [chunk1, chunk2]", + "Group 1: [chunk3]", + "Group 2: [chunk4]", + ]; + assert_groups_eq!(expected, groups); + } + + // --- Test infrastructure -- + fn to_string(groups: Vec>>) -> Vec { + let mut s = vec![]; + for (idx, group) in groups.iter().enumerate() { + let names = group + .iter() + .map(|c| { + let c = c.as_any().downcast_ref::().unwrap(); + c.table_name() + }) + .collect::>(); + s.push(format!("Group {}: [{}]", idx, names.join(", "))); + } + s + } +} diff --git a/iox_query/src/provider/physical.rs b/iox_query/src/provider/physical.rs new file mode 100644 index 0000000..3114cf8 --- /dev/null +++ b/iox_query/src/provider/physical.rs @@ -0,0 +1,725 @@ +//! Implementation of a DataFusion PhysicalPlan node across partition chunks + +use crate::statistics::build_statistics_for_chunks; +use crate::{ + provider::record_batch_exec::RecordBatchesExec, util::arrow_sort_key_exprs, QueryChunk, + QueryChunkData, CHUNK_ORDER_COLUMN_NAME, +}; +use arrow::datatypes::{Fields, Schema as ArrowSchema, SchemaRef}; +use datafusion::{ + datasource::{ + listing::PartitionedFile, + object_store::ObjectStoreUrl, + physical_plan::{FileScanConfig, ParquetExec}, + }, + physical_expr::PhysicalSortExpr, + physical_plan::{empty::EmptyExec, expressions::Column, union::UnionExec, ExecutionPlan}, + scalar::ScalarValue, +}; +use object_store::ObjectMeta; +use schema::{sort::SortKey, Schema}; +use std::{ + collections::{hash_map::Entry, HashMap, HashSet}, + sync::Arc, +}; + +/// Extension for [`PartitionedFile`] to hold the original [`QueryChunk`] and the [`SortKey`] that was passed to [`chunks_to_physical_nodes`]. +pub struct PartitionedFileExt { + pub chunk: Arc, + pub output_sort_key_memo: Option, +} + +/// Holds a list of chunks that all have the same "URL" and +/// will be scanned using the same ParquetExec. 
+/// +/// Also tracks the overall sort key which is provided to DataFusion +/// plans +#[derive(Debug)] +struct ParquetChunkList { + object_store_url: ObjectStoreUrl, + chunks: Vec<(ObjectMeta, Arc)>, + /// Sort key to place on the ParquetExec, validated to be + /// compatible with all chunk sort keys + sort_key: Option, +} + +impl ParquetChunkList { + /// Create a new chunk list with the specified chunk and overall + /// sort order. If the desired output sort key is specified + /// (e.g. the partition sort key) also computes compatibility with + /// with the chunk order. + fn new( + object_store_url: ObjectStoreUrl, + chunk: &Arc, + meta: ObjectMeta, + output_sort_key: Option<&SortKey>, + ) -> Self { + let sort_key = combine_sort_key(output_sort_key.cloned(), chunk.sort_key(), chunk.schema()); + + Self { + object_store_url, + chunks: vec![(meta, Arc::clone(chunk))], + sort_key, + } + } + + /// Add the parquet file the list of files to be scanned, updating + /// the sort key as necessary. + fn add_parquet_file(&mut self, chunk: &Arc, meta: ObjectMeta) { + self.chunks.push((meta, Arc::clone(chunk))); + + self.sort_key = combine_sort_key(self.sort_key.take(), chunk.sort_key(), chunk.schema()); + } +} + +/// Combines the existing sort key with the sort key of the chunk, +/// returning the new combined compatible sort key that describes both +/// chunks. 
+/// +/// If it is not possible to find a compatible sort key, None is +/// returned signifying "unknown sort order" +fn combine_sort_key( + existing_sort_key: Option, + chunk_sort_key: Option<&SortKey>, + chunk_schema: &Schema, +) -> Option { + if let (Some(existing_sort_key), Some(chunk_sort_key)) = (existing_sort_key, chunk_sort_key) { + let combined_sort_key = SortKey::try_merge_key(&existing_sort_key, chunk_sort_key); + + if let Some(combined_sort_key) = combined_sort_key { + let chunk_cols = chunk_schema + .iter() + .map(|(_t, field)| field.name().as_str()) + .collect::>(); + for (col, _opts) in combined_sort_key.iter() { + if !chunk_sort_key.contains(col.as_ref()) && chunk_cols.contains(col.as_ref()) { + return None; + } + } + } + + // Avoid cloning the sort key when possible, as the sort key + // is likely to commonly be the same + match combined_sort_key { + Some(combined_sort_key) if combined_sort_key == &existing_sort_key => { + Some(existing_sort_key) + } + Some(combined_sort_key) => Some(combined_sort_key.clone()), + None => None, + } + } else { + // no existing sort key means the data wasn't consistently sorted so leave it alone + None + } +} + +/// Place [chunk](QueryChunk)s into physical nodes. +/// +/// This will group chunks into [record batch](QueryChunkData::RecordBatches) and [parquet +/// file](QueryChunkData::Parquet) chunks. The latter will also be grouped by store. +/// +/// Record batch chunks will be turned into a single [`RecordBatchesExec`]. +/// +/// Parquet chunks will be turned into a [`ParquetExec`] per store, each of them with +/// [`target_partitions`](datafusion::execution::context::SessionConfig::target_partitions) file groups. +/// +/// If this function creates more than one physical node, they will be combined using an [`UnionExec`]. Otherwise, a +/// single node will be returned directly. 
+/// +/// If output_sort_key is specified, the ParquetExec will be marked +/// with that sort key, otherwise it will be computed from the input chunks. TODO check if this is helpful or not +/// +/// # Empty Inputs +/// For empty inputs (i.e. no chunks), this will create a single [`EmptyExec`] node with appropriate schema. +/// +/// # Predicates +/// The give `predicate` will only be applied to [`ParquetExec`] nodes since they are the only node type benifiting from +/// pushdown ([`RecordBatchesExec`] has NO builtin filter function). Delete predicates are NOT applied at all. The +/// caller is responsible for wrapping the output node into appropriate filter nodes. +pub fn chunks_to_physical_nodes( + schema: &SchemaRef, + output_sort_key: Option<&SortKey>, + chunks: Vec>, + target_partitions: usize, +) -> Arc { + if chunks.is_empty() { + return Arc::new(EmptyExec::new(Arc::clone(schema))); + } + + let mut record_batch_chunks: Vec> = vec![]; + let mut parquet_chunks: HashMap = HashMap::new(); + + for chunk in &chunks { + match chunk.data() { + QueryChunkData::RecordBatches(_) => { + record_batch_chunks.push(Arc::clone(chunk)); + } + QueryChunkData::Parquet(parquet_input) => { + let url_str = parquet_input.object_store_url.as_str().to_owned(); + match parquet_chunks.entry(url_str) { + Entry::Occupied(mut o) => { + o.get_mut() + .add_parquet_file(chunk, parquet_input.object_meta); + } + Entry::Vacant(v) => { + // better have some instead of no sort information at all + let output_sort_key = output_sort_key.or_else(|| chunk.sort_key()); + v.insert(ParquetChunkList::new( + parquet_input.object_store_url, + chunk, + parquet_input.object_meta, + output_sort_key, + )); + } + } + } + } + } + + let mut output_nodes: Vec> = vec![]; + if !record_batch_chunks.is_empty() { + output_nodes.push(Arc::new(RecordBatchesExec::new( + record_batch_chunks, + Arc::clone(schema), + output_sort_key.cloned(), + ))); + } + let mut parquet_chunks: Vec<_> = parquet_chunks.into_iter().collect(); + 
parquet_chunks.sort_by_key(|(url_str, _)| url_str.clone()); + let has_chunk_order_col = schema.field_with_name(CHUNK_ORDER_COLUMN_NAME).is_ok(); + for (_url_str, chunk_list) in parquet_chunks { + let ParquetChunkList { + object_store_url, + mut chunks, + sort_key, + } = chunk_list; + + // ensure that chunks are actually ordered by chunk order + chunks.sort_by_key(|(_meta, c)| c.order()); + + // Compute statistics for the chunks + let query_chunks = chunks + .iter() + .map(|(_meta, chunk)| Arc::clone(chunk)) + .collect::>(); + let statistics = build_statistics_for_chunks(&query_chunks, Arc::clone(schema)); + + let file_groups = distribute( + chunks.into_iter().map(|(object_meta, chunk)| { + let partition_values = if has_chunk_order_col { + vec![ScalarValue::from(chunk.order().get())] + } else { + vec![] + }; + PartitionedFile { + object_meta, + partition_values, + range: None, + extensions: Some(Arc::new(PartitionedFileExt { + chunk, + output_sort_key_memo: output_sort_key.cloned(), + })), + } + }), + target_partitions, + ); + + // Tell datafusion about the sort key, if any + let output_ordering = sort_key.map(|sort_key| arrow_sort_key_exprs(&sort_key, schema)); + + let (table_partition_cols, file_schema, output_ordering) = if has_chunk_order_col { + let table_partition_cols = vec![schema + .field_with_name(CHUNK_ORDER_COLUMN_NAME) + .unwrap() + .clone()]; + let file_schema = Arc::new(ArrowSchema::new( + schema + .fields + .iter() + .filter(|f| f.name() != CHUNK_ORDER_COLUMN_NAME) + .map(Arc::clone) + .collect::(), + )); + let output_ordering = Some( + output_ordering + .unwrap_or_default() + .into_iter() + .chain(std::iter::once(PhysicalSortExpr { + expr: Arc::new( + Column::new_with_schema(CHUNK_ORDER_COLUMN_NAME, schema) + .expect("just added col"), + ), + options: Default::default(), + })) + .collect::>(), + ); + (table_partition_cols, file_schema, output_ordering) + } else { + (vec![], Arc::clone(schema), output_ordering) + }; + + // No sort order is 
represented by an empty Vec + let output_ordering = vec![output_ordering.unwrap_or_default()]; + + let base_config = FileScanConfig { + object_store_url, + file_schema, + file_groups, + statistics, + projection: None, + limit: None, + table_partition_cols, + output_ordering, + }; + let meta_size_hint = None; + + let parquet_exec = ParquetExec::new(base_config, None, meta_size_hint); + output_nodes.push(Arc::new(parquet_exec)); + } + + assert!(!output_nodes.is_empty()); + Arc::new(UnionExec::new(output_nodes)) +} + +/// Distribute items from the given iterator into `n` containers. +/// +/// This will produce less than `n` containers if the input has less than `n` elements. +/// +/// # Panic +/// Panics if `n` is 0. +fn distribute(it: I, n: usize) -> Vec> +where + I: IntoIterator, +{ + assert!(n > 0); + + let mut outputs: Vec<_> = (0..n).map(|_| vec![]).collect(); + let mut pos = 0usize; + for x in it { + outputs[pos].push(x); + pos = (pos + 1) % n; + } + outputs.into_iter().filter(|o| !o.is_empty()).collect() +} + +#[cfg(test)] +mod tests { + use datafusion::{ + common::stats::Precision, + physical_plan::{ColumnStatistics, Statistics}, + }; + use schema::{sort::SortKeyBuilder, InfluxFieldType, SchemaBuilder, TIME_COLUMN_NAME}; + + use crate::{ + chunk_order_field, + statistics::build_statistics_for_chunks, + test::{format_execution_plan, TestChunk}, + }; + + use super::*; + + #[test] + fn test_distribute() { + assert_eq!(distribute(0..0u8, 1), Vec::>::new(),); + + assert_eq!(distribute(0..3u8, 1), vec![vec![0, 1, 2]],); + + assert_eq!(distribute(0..3u8, 2), vec![vec![0, 2], vec![1]],); + + assert_eq!(distribute(0..3u8, 10), vec![vec![0], vec![1], vec![2]],); + } + + #[test] + fn test_combine_sort_key() { + let schema_t1 = SchemaBuilder::new().tag("t1").timestamp().build().unwrap(); + let skey_t1 = SortKeyBuilder::new() + .with_col("t1") + .with_col(TIME_COLUMN_NAME) + .build(); + + let schema_t1_t2 = SchemaBuilder::new() + .tag("t1") + .tag("t2") + .timestamp() + 
.build() + .unwrap(); + let skey_t1_t2 = SortKeyBuilder::new() + .with_col("t1") + .with_col("t2") + .with_col(TIME_COLUMN_NAME) + .build(); + + let skey_t2_t1 = SortKeyBuilder::new() + .with_col("t2") + .with_col("t1") + .with_col(TIME_COLUMN_NAME) + .build(); + + // output is None if any of the parameters is None (either no sort key requested or chunk is unsorted) + assert_eq!(combine_sort_key(None, None, &schema_t1), None); + assert_eq!( + combine_sort_key(Some(skey_t1.clone()), None, &schema_t1), + None + ); + assert_eq!(combine_sort_key(None, Some(&skey_t1), &schema_t1), None); + + // keeping sort key identical works + assert_eq!( + combine_sort_key(Some(skey_t1.clone()), Some(&skey_t1), &schema_t1), + Some(skey_t1.clone()) + ); + assert_eq!( + combine_sort_key(Some(skey_t1.clone()), Some(&skey_t1), &schema_t1_t2), + Some(skey_t1.clone()) + ); + + // extending sort key works (chunk has more columns than existing key) + assert_eq!( + combine_sort_key(Some(skey_t1.clone()), Some(&skey_t1_t2), &schema_t1_t2), + Some(skey_t1_t2.clone()) + ); + + // extending sort key works (quorum has more columns than this chunk) + assert_eq!( + combine_sort_key(Some(skey_t1_t2.clone()), Some(&skey_t1), &schema_t1), + Some(skey_t1_t2.clone()) + ); + assert_eq!( + combine_sort_key(Some(skey_t2_t1.clone()), Some(&skey_t1), &schema_t1), + Some(skey_t2_t1.clone()) + ); + + // extending does not work if quorum covers columns that the chunk has but that are NOT sorted for that chunk + assert_eq!( + combine_sort_key(Some(skey_t1_t2.clone()), Some(&skey_t1), &schema_t1_t2), + None + ); + assert_eq!( + combine_sort_key(Some(skey_t2_t1.clone()), Some(&skey_t1), &schema_t1_t2), + None + ); + + // column order conflicts are detected + assert_eq!( + combine_sort_key(Some(skey_t2_t1), Some(&skey_t1_t2), &schema_t1_t2), + None + ); + } + + #[test] + fn test_chunks_to_physical_nodes_empty() { + let schema = TestChunk::new("table").schema().as_arrow(); + let plan = 
chunks_to_physical_nodes(&schema, None, vec![], 2); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " EmptyExec" + "### + ); + } + + #[test] + fn test_chunks_to_physical_nodes_recordbatch() { + let chunk = TestChunk::new("table"); + let schema = chunk.schema().as_arrow(); + let plan = chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk)], 2); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " UnionExec" + - " RecordBatchesExec: chunks=1" + "### + ); + } + + #[test] + fn test_chunks_to_physical_nodes_parquet_one_file() { + let chunk = TestChunk::new("table").with_dummy_parquet_file(); + let schema = chunk.schema().as_arrow(); + let plan = chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk)], 2); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}" + "### + ); + } + + #[test] + fn test_chunks_to_physical_nodes_parquet_many_files() { + let chunk1 = TestChunk::new("table").with_id(0).with_dummy_parquet_file(); + let chunk2 = TestChunk::new("table").with_id(1).with_dummy_parquet_file(); + let chunk3 = TestChunk::new("table").with_id(2).with_dummy_parquet_file(); + let schema = chunk1.schema().as_arrow(); + let plan = chunks_to_physical_nodes( + &schema, + None, + vec![Arc::new(chunk1), Arc::new(chunk2), Arc::new(chunk3)], + 2, + ); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[0.parquet, 2.parquet], [1.parquet]]}" + "### + ); + } + + #[test] + fn test_chunks_to_physical_nodes_parquet_many_store() { + let chunk1 = TestChunk::new("table") + .with_id(0) + .with_dummy_parquet_file_and_store("iox1://"); + let chunk2 = TestChunk::new("table") + .with_id(1) + .with_dummy_parquet_file_and_store("iox2://"); + let schema = chunk1.schema().as_arrow(); + let plan = + chunks_to_physical_nodes(&schema, None, 
vec![Arc::new(chunk1), Arc::new(chunk2)], 2); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}" + "### + ); + } + + #[test] + fn test_chunks_to_physical_nodes_mixed() { + let chunk1 = TestChunk::new("table").with_dummy_parquet_file(); + let chunk2 = TestChunk::new("table"); + let schema = chunk1.schema().as_arrow(); + let plan = + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1), Arc::new(chunk2)], 2); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " UnionExec" + - " RecordBatchesExec: chunks=1" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}" + "### + ); + } + + #[test] + fn test_chunks_to_physical_nodes_mixed_with_chunk_order() { + let chunk1 = TestChunk::new("table") + .with_tag_column("tag") + .with_dummy_parquet_file(); + let chunk2 = TestChunk::new("table").with_tag_column("tag"); + let schema = Arc::new(ArrowSchema::new( + chunk1 + .schema() + .as_arrow() + .fields + .iter() + .cloned() + .chain(std::iter::once(chunk_order_field())) + .collect::(), + )); + let plan = + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1), Arc::new(chunk2)], 2); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[tag, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[tag, __chunk_order], output_ordering=[__chunk_order@1 ASC]" + "### + ); + } + + // reproducer of https://github.com/influxdata/idpe/issues/18287 + #[test] + fn reproduce_schema_bug_in_parquet_exec() { + // schema with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("field", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + 
.build() + .unwrap() + .into(); + + // create a test chunk with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)), + ); + + // create them same test chunk but with a parquet file + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)) + .with_dummy_parquet_file(), + ); + + // Build a RecordBatchsExec for record_batch_chunk + // + // Use chunks_to_physical_nodes to build a plan with UnionExec on top of RecordBatchesExec + // Note: I purposely use chunks_to_physical_node to create plan for both record_batch_chunk and parquet_chunk to + // consistently create their plan. 
Also chunks_to_physical_nodes is used to create plans in the optimization + passes that I will need
When we get the plan's statistics, we won't care about CHUNK_ORDER_COLUMN_NAME because it is not a real column.
Precision::Exact(ScalarValue::Utf8(Some("AL".to_string()))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(20), None)), + min_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(10), None)), + distinct_count: Precision::Absent, + }, + ]; + // + // Add CHUNK_ORDER_COLUMN_NAME with stats + let mut parquet_file_col_stats = col_stats.clone(); + parquet_file_col_stats.push(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(6))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }); + // + // Add CHUNK_ORDER_COLUMN_NAME without stats + let mut parquet_plan_stats_col_stats = col_stats; + parquet_plan_stats_col_stats.push(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }); + // + let expected_parquet_plan_stats = Statistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Absent, + column_statistics: parquet_plan_stats_col_stats, + }; + // + let expected_parquet_file_stats = Statistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Absent, + column_statistics: parquet_file_col_stats, + }; + + // Content of Record batch plan stats that include stats of CHUNK_ORDER_COLUMN_NAME + assert_eq!(record_batch_plan_stats, expected_parquet_file_stats); + // Content of parquet file stats that also include stats of CHUNK_ORDER_COLUMN_NAME + assert_eq!(*parqet_file_stats, expected_parquet_file_stats); + // + // Content of parquet plan stats that does not include stats of CHUNK_ORDER_COLUMN_NAME + 
assert_eq!(parquet_plan_stats, expected_parquet_plan_stats); + } +} diff --git a/iox_query/src/provider/progressive_eval.rs b/iox_query/src/provider/progressive_eval.rs new file mode 100644 index 0000000..80109e4 --- /dev/null +++ b/iox_query/src/provider/progressive_eval.rs @@ -0,0 +1,1206 @@ +// ProgressiveEvalExec (step 1 in https://docs.google.com/document/d/1x1yf9ggyxD4JPT8Gf9YlIKxUawqoKTJ1HFyTbGin9xY/edit) +// This will be moved to DF once it is ready + +//! Defines the progressive eval plan + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion::common::{internal_err, DataFusionError, Result}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::{EquivalenceProperties, PhysicalSortExpr, PhysicalSortRequirement}; +use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; +use datafusion::physical_plan::stream::RecordBatchReceiverStream; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, RecordBatchStream, + SendableRecordBatchStream, Statistics, +}; +use datafusion::scalar::ScalarValue; +use futures::{ready, Stream, StreamExt}; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use observability_deps::tracing::{debug, trace}; + +/// ProgressiveEval return a stream of record batches in the order of its inputs. +/// It will stop when the number of output rows reach the given limit. +/// +/// This takes an input execution plan and a number n, and provided each partition of +/// the input plan is in an expected order, this operator will return top record batches that covers the top n rows +/// in the order of the input plan. 
+/// +/// ```text +/// ┌─────────────────────────┐ +/// │ ┌───┬───┬───┬───┐ │ +/// │ │ A │ B │ C │ D │ │──┐ +/// │ └───┴───┴───┴───┘ │ │ +/// └─────────────────────────┘ │ ┌───────────────────┐ ┌───────────────────────────────┐ +/// Stream 1 │ │ │ │ ┌───┬───╦═══╦───┬───╦═══╗ │ +/// ├─▶│ ProgressiveEval │───▶│ │ A │ B ║ C ║ D │ M ║ N ║ ... │ +/// │ │ │ │ └───┴─▲─╩═══╩───┴───╩═══╝ │ +/// ┌─────────────────────────┐ │ └───────────────────┘ └─┬─────┴───────────────────────┘ +/// │ ╔═══╦═══╗ │ │ +/// │ ║ M ║ N ║ │──┘ │ +/// │ ╚═══╩═══╝ │ Output only include top record batches that cover top N rows +/// └─────────────────────────┘ +/// Stream 2 +/// +/// +/// Input Streams Output stream +/// (in some order) (in same order) +/// ``` +#[derive(Debug)] +pub(crate) struct ProgressiveEvalExec { + /// Input plan + input: Arc, + + /// Corresponding value ranges of the input plan + /// None if the value ranges are not available + value_ranges: Option>, + + /// Execution metrics + metrics: ExecutionPlanMetricsSet, + + /// Optional number of rows to fetch. 
Stops producing rows after this fetch + fetch: Option, +} + +impl ProgressiveEvalExec { + /// Create a new progressive execution plan + pub fn new( + input: Arc, + value_ranges: Option>, + fetch: Option, + ) -> Self { + Self { + input, + value_ranges, + metrics: ExecutionPlanMetricsSet::new(), + fetch, + } + } + + /// Input schema + pub fn input(&self) -> &Arc { + &self.input + } +} + +impl DisplayAs for ProgressiveEvalExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "ProgressiveEvalExec: ")?; + if let Some(fetch) = self.fetch { + write!(f, "fetch={fetch}, ")?; + }; + if let Some(value_ranges) = &self.value_ranges { + write!(f, "input_ranges={value_ranges:?}")?; + }; + + Ok(()) + } + } + } +} + +impl ExecutionPlan for ProgressiveEvalExec { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + /// Get the output partitioning of this plan + fn output_partitioning(&self) -> Partitioning { + // This node serializes all the data to a single partition + Partitioning::UnknownPartitioning(1) + } + + /// Specifies whether this plan generates an infinite stream of records. + /// If the plan does not support pipelining, but its input(s) are + /// infinite, returns an error to indicate this. 
// todo: maybe in the future we do not need this parallelism if the number of fetched rows is in the first stream
ProgressiveEvalStream::new_from_receivers"); + + Ok(Box::pin(result)) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result { + self.input.statistics() + } + + fn equivalence_properties(&self) -> EquivalenceProperties { + // progressive eval does not change the equivalence properties of its input + self.input.equivalence_properties() + } +} + +/// Concat input streams until reaching the fetch limit +struct ProgressiveEvalStream { + /// input streams + input_streams: Vec, + + /// The schema of the input and output. + schema: SchemaRef, + + /// used to record execution metrics + metrics: BaselineMetrics, + + /// Index of current stream + current_stream_idx: usize, + + /// If the stream has encountered an error + aborted: bool, + + /// Optional number of rows to fetch + fetch: Option, + + /// number of rows produced + produced: usize, +} + +impl ProgressiveEvalStream { + fn new( + input_streams: Vec, + schema: SchemaRef, + metrics: BaselineMetrics, + fetch: Option, + ) -> Result { + Ok(Self { + input_streams, + schema, + metrics, + current_stream_idx: 0, + aborted: false, + fetch, + produced: 0, + }) + } +} + +impl Stream for ProgressiveEvalStream { + type Item = Result; + + // Return the next record batch until reaching the fetch limit or the end of all input streams + // Return pending if the next record batch is not ready + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // Error in previous poll + if self.aborted { + return Poll::Ready(None); + } + + // Have reached the fetch limit + if self.produced >= self.fetch.unwrap_or(std::usize::MAX) { + return Poll::Ready(None); + } + + // Have reached the end of all input streams + if self.current_stream_idx >= self.input_streams.len() { + return Poll::Ready(None); + } + + // Get next record batch + let mut poll; + loop { + let idx = self.current_stream_idx; + poll = self.input_streams[idx].poll_next_unpin(cx); + match poll { + // This 
// Reaching here means the data of all streams has been read
// receiver dropped when query is shutdown early (e.g., limit) or error, + // no need to propagate the send error.
// still return all rows even when selecting 3 rows because the first record batch is returned
1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + } + + #[tokio::test] + async fn test_return_all() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("c"), + Some("e"), + Some("g"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("b"), + Some("d"), + Some("f"), + Some("h"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2, 6])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b1, b2] + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()]], + None, + None, // no fetch limit --> return all rows + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "| 10 | b | 1970-01-01T00:00:00.000000004 |", + "| 20 | d | 1970-01-01T00:00:00.000000006 |", + "| 70 | f | 1970-01-01T00:00:00.000000002 |", + "| 90 | h | 1970-01-01T00:00:00.000000002 |", + "| 30 | j | 1970-01-01T00:00:00.000000006 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b2, b1] + _test_progressive_eval( + &[vec![b2], vec![b1]], + None, + None, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + 
"+----+---+-------------------------------+", + "| 10 | b | 1970-01-01T00:00:00.000000004 |", + "| 20 | d | 1970-01-01T00:00:00.000000006 |", + "| 70 | f | 1970-01-01T00:00:00.000000002 |", + "| 90 | h | 1970-01-01T00:00:00.000000002 |", + "| 30 | j | 1970-01-01T00:00:00.000000006 |", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_return_all_on_different_length_batches() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b1, b2] + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()]], + None, + None, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 
1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b2, b1] + _test_progressive_eval( + &[vec![b2], vec![b1]], + None, + None, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_limit_1() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b2, b1] + // b2 has 3 rows. 
b1 has 5 rows + // Fetch limit is 1 --> return all 3 rows of the first batch (b2) that covers that limit + _test_progressive_eval( + &[vec![b2.clone()], vec![b1.clone()]], + None, + Some(1), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2] + // b1 has 5 rows. b2 has 3 rows + // Fetch limit is 1 --> return all 5 rows of the first batch (b1) that covers that limit + _test_progressive_eval( + &[vec![b1], vec![b2]], + None, + Some(1), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_limit_equal_first_batch_size() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), 
("c", c)]).unwrap(); + + // [b2, b1] + // b2 has 3 rows. b1 has 5 rows + // Fetch limit is 3 --> return all 3 rows of the first batch (b2) that covers that limit + _test_progressive_eval( + &[vec![b2.clone()], vec![b1.clone()]], + None, + Some(3), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2] + // b1 has 5 rows. b2 has 3 rows + // Fetch limit is 5 --> return all 5 rows of first batch (b1) that covers that limit + _test_progressive_eval( + &[vec![b1], vec![b2]], + None, + Some(5), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_limit_over_first_batch_size() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = 
RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b2, b1] + // b2 has 3 rows. b1 has 5 rows + // Fetch limit is 4 --> return all rows of both batches in the order of b2, b1 + _test_progressive_eval( + &[vec![b2.clone()], vec![b1.clone()]], + None, + Some(4), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2] + // b1 has 5 rows. b2 has 3 rows + // Fetch limit is 6 --> return all rows of both batches in the order of b1, b2 + _test_progressive_eval( + &[vec![b1], vec![b2]], + None, + Some(6), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_three_partitions_with_nulls() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + None, + Some("f"), + ])); + let c: ArrayRef = 
Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 70])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("e"), + Some("g"), + Some("h"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![40, 60, 20])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![100, 200, 700, 900])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + None, + Some("g"), + Some("h"), + Some("i"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2])); + let b3 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b1, b2, b3] + // b1 has 5 rows. b2 has 3 rows. b3 has 4 rows + // Fetch limit is 1 --> return all rows of the b1 + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()], vec![b3.clone()]], + None, + Some(1), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | | 1970-01-01T00:00:00.000000005 |", + "| 3 | f | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2, b3] + // b1 has 5 rows. b2 has 3 rows. 
b3 has 4 rows + // Fetch limit is 7 --> return all rows of the b1 & b2 in the order of b1, b2 + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()], vec![b3.clone()]], + None, + Some(7), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | | 1970-01-01T00:00:00.000000005 |", + "| 3 | f | 1970-01-01T00:00:00.000000008 |", + "| 10 | e | 1970-01-01T00:00:00.000000040 |", + "| 20 | g | 1970-01-01T00:00:00.000000060 |", + "| 70 | h | 1970-01-01T00:00:00.000000020 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2, b3] + // b1 has 5 rows. b2 has 3 rows. b3 has 4 rows + // Fetch limit is 50 --> return all rows of all batches in the order of b1, b2, b3 + _test_progressive_eval( + &[vec![b1], vec![b2], vec![b3]], + None, + Some(50), + &[ + "+-----+---+-------------------------------+", + "| a | b | c |", + "+-----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | | 1970-01-01T00:00:00.000000005 |", + "| 3 | f | 1970-01-01T00:00:00.000000008 |", + "| 10 | e | 1970-01-01T00:00:00.000000040 |", + "| 20 | g | 1970-01-01T00:00:00.000000060 |", + "| 70 | h | 1970-01-01T00:00:00.000000020 |", + "| 100 | | 1970-01-01T00:00:00.000000004 |", + "| 200 | g | 1970-01-01T00:00:00.000000006 |", + "| 700 | h | 1970-01-01T00:00:00.000000002 |", + "| 900 | i | 1970-01-01T00:00:00.000000002 |", + "+-----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + async fn _test_progressive_eval( + partitions: &[Vec], + value_ranges: Option>, + fetch: Option, + exp: &[&str], + context: Arc, + ) { + let schema = if partitions.is_empty() { + // just whatwever 
schema + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); + let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); + batch.schema() + } else { + partitions[0][0].schema() + }; + + let exec = MemoryExec::try_new(partitions, schema, None).unwrap(); + let progressive = Arc::new(ProgressiveEvalExec::new( + Arc::new(exec), + value_ranges, + fetch, + )); + + let collected = collect(progressive, context).await.unwrap(); + assert_batches_eq!(exp, collected.as_slice()); + } + + #[tokio::test] + async fn test_merge_metrics() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("a"), Some("c")])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("b"), Some("d")])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap(); + + let schema = b1.schema(); + let exec = MemoryExec::try_new(&[vec![b1], vec![b2]], schema, None).unwrap(); + let progressive = Arc::new(ProgressiveEvalExec::new(Arc::new(exec), None, None)); + + let collected = collect(Arc::::clone(&progressive), task_ctx) + .await + .unwrap(); + let expected = [ + "+----+---+", + "| a | b |", + "+----+---+", + "| 1 | a |", + "| 2 | c |", + "| 10 | b |", + "| 20 | d |", + "+----+---+", + ]; + assert_batches_eq!(expected, collected.as_slice()); + + // Now, validate metrics + let metrics = progressive.metrics().unwrap(); + + assert_eq!(metrics.output_rows().unwrap(), 4); + assert!(metrics.elapsed_compute().unwrap() > 0); + + let mut saw_start = false; + let mut saw_end = false; + metrics.iter().for_each(|m| match m.value() { + MetricValue::StartTimestamp(ts) => { + saw_start = true; + assert!(nanos_from_timestamp(ts) > 0); + } + MetricValue::EndTimestamp(ts) => { + saw_end = true; + assert!(nanos_from_timestamp(ts) > 0); 
+ } + _ => {} + }); + + assert!(saw_start); + assert!(saw_end); + } + + fn nanos_from_timestamp(ts: &Timestamp) -> i64 { + ts.value().unwrap().timestamp_nanos_opt().unwrap() + } + + #[tokio::test] + async fn test_drop_cancel() -> Result<()> { + let task_ctx = Arc::new(TaskContext::default()); + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)])); + + let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 2)); + let refs = blocking_exec.refs(); + let progressive_exec = Arc::new(ProgressiveEvalExec::new(blocking_exec, None, None)); + + let fut = collect(progressive_exec, task_ctx); + let mut fut = fut.boxed(); + + assert_is_pending(&mut fut); + drop(fut); + assert_strong_count_converges_to_zero(refs).await; + + Ok(()) + } + + // todo: this is copied from DF. When we move ProgressiveEval to DF, this will be removed + /// Asserts that the strong count of the given [`Weak`] pointer converges to zero. + /// + /// This might take a while but has a timeout. + pub async fn assert_strong_count_converges_to_zero(refs: Weak) { + #![allow(clippy::future_not_send)] + tokio::time::timeout(std::time::Duration::from_secs(10), async { + loop { + if Weak::strong_count(&refs) == 0 { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + } + }) + .await + .unwrap(); + } + + // todo: this is copied from DF. When we move ProgressiveEval to DF, this will be removed + /// Asserts that given future is pending. + pub fn assert_is_pending<'a, T>(fut: &mut Pin + Send + 'a>>) { + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + let poll = fut.poll_unpin(&mut cx); + + assert!(poll.is_pending()); + } + + // todo: this is copied from DF. When we move ProgressiveEval to DF, this will be removed + /// Execution plan that emits streams that block forever. + /// + /// This is useful to test shutdown / cancelation behavior of certain execution plans. 
+ #[derive(Debug)] + pub struct BlockingExec { + /// Schema that is mocked by this plan. + schema: SchemaRef, + + /// Number of output partitions. + n_partitions: usize, + + /// Ref-counting helper to check if the plan and the produced stream are still in memory. + refs: Arc<()>, + } + + impl BlockingExec { + /// Create new [`BlockingExec`] with a give schema and number of partitions. + pub fn new(schema: SchemaRef, n_partitions: usize) -> Self { + Self { + schema, + n_partitions, + refs: Default::default(), + } + } + + /// Weak pointer that can be used for ref-counting this execution plan and its streams. + /// + /// Use [`Weak::strong_count`] to determine if the plan itself and its streams are dropped (should be 0 in that + /// case). Note that tokio might take some time to cancel spawned tasks, so you need to wrap this check into a retry + /// loop. Use [`assert_strong_count_converges_to_zero`] to archive this. + pub fn refs(&self) -> Weak<()> { + Arc::downgrade(&self.refs) + } + } + + impl DisplayAs for BlockingExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter<'_>, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "BlockingExec",) + } + } + } + } + + impl ExecutionPlan for BlockingExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn children(&self) -> Vec> { + // this is a leaf node and has no children + vec![] + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(self.n_partitions) + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn with_new_children( + self: Arc, + _: Vec>, + ) -> Result> { + internal_err!("Children cannot be replaced in {self:?}") + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + Ok(Box::pin(BlockingStream { + schema: Arc::clone(&self.schema), + _refs: 
Arc::clone(&self.refs), + })) + } + + fn statistics(&self) -> Result { + unimplemented!() + } + } + + /// A [`RecordBatchStream`] that is pending forever. + #[derive(Debug)] + pub struct BlockingStream { + /// Schema mocked by this stream. + schema: SchemaRef, + + /// Ref-counting helper to check if the stream are still in memory. + _refs: Arc<()>, + } + + impl Stream for BlockingStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Pending + } + } + + impl RecordBatchStream for BlockingStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + } +} diff --git a/iox_query/src/provider/record_batch_exec.rs b/iox_query/src/provider/record_batch_exec.rs new file mode 100644 index 0000000..6122286 --- /dev/null +++ b/iox_query/src/provider/record_batch_exec.rs @@ -0,0 +1,191 @@ +//! Implementation of a DataFusion PhysicalPlan node across partition chunks + +use crate::statistics::build_statistics_for_chunks; +use crate::{QueryChunk, CHUNK_ORDER_COLUMN_NAME}; + +use super::adapter::SchemaAdapterStream; +use arrow::datatypes::SchemaRef; +use datafusion::physical_plan::display::ProjectSchemaDisplay; +use datafusion::{ + error::DataFusionError, + execution::context::TaskContext, + physical_plan::{ + expressions::{Column, PhysicalSortExpr}, + metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, + Statistics, + }, + scalar::ScalarValue, +}; +use observability_deps::tracing::trace; +use schema::sort::SortKey; +use std::{collections::HashMap, fmt, sync::Arc}; + +/// Implements the DataFusion physical plan interface for [`RecordBatch`]es with automatic projection and NULL-column creation. +/// +/// +/// [`RecordBatch`]: arrow::record_batch::RecordBatch +#[derive(Debug)] +pub(crate) struct RecordBatchesExec { + /// Chunks contained in this exec node. + chunks: Vec>, + + /// Overall schema. 
+ schema: SchemaRef, + + /// Execution metrics + metrics: ExecutionPlanMetricsSet, + + /// Statistics over all batches. + statistics: Statistics, + + /// Sort key that was passed to [`chunks_to_physical_nodes`]. + /// + /// This is NOT used to set the output ordering. It is only here to recover this information later. + /// + /// + /// [`chunks_to_physical_nodes`]: super::physical::chunks_to_physical_nodes + output_sort_key_memo: Option, + + /// Output ordering. + output_ordering: Option>, +} + +impl RecordBatchesExec { + pub fn new( + chunks: impl IntoIterator>, + schema: SchemaRef, + output_sort_key_memo: Option, + ) -> Self { + let chunks: Vec<_> = chunks.into_iter().collect(); + let statistics = build_statistics_for_chunks(&chunks, Arc::clone(&schema)); + + let chunk_order_field = schema.field_with_name(CHUNK_ORDER_COLUMN_NAME).ok(); + let output_ordering = if chunk_order_field.is_some() { + Some(vec![ + // every chunk gets its own partition, so we can claim that the output is ordered + PhysicalSortExpr { + expr: Arc::new( + Column::new_with_schema(CHUNK_ORDER_COLUMN_NAME, &schema) + .expect("just checked presence of chunk order col"), + ), + options: Default::default(), + }, + ]) + } else { + None + }; + + Self { + chunks, + schema, + statistics, + output_sort_key_memo, + output_ordering, + metrics: ExecutionPlanMetricsSet::new(), + } + } + + /// Chunks that make up this node. + pub fn chunks(&self) -> impl Iterator> { + self.chunks.iter() + } + + /// Sort key that was passed to [`chunks_to_physical_nodes`]. + /// + /// This is NOT used to set the output ordering. It is only here to recover this information later. 
+ /// + /// + /// [`chunks_to_physical_nodes`]: super::physical::chunks_to_physical_nodes + pub fn output_sort_key_memo(&self) -> Option<&SortKey> { + self.output_sort_key_memo.as_ref() + } +} + +impl ExecutionPlan for RecordBatchesExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(self.chunks.len()) + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + self.output_ordering.as_deref() + } + + fn children(&self) -> Vec> { + // no inputs + vec![] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion::error::Result> { + assert!(children.is_empty(), "no children expected in iox plan"); + + Ok(self) + } + + fn execute( + &self, + partition: usize, + _context: Arc, + ) -> datafusion::error::Result { + trace!(partition, "Start RecordBatchesExec::execute"); + + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + + let schema = self.schema(); + + let chunk = &self.chunks[partition]; + + let stream = match chunk.data() { + crate::QueryChunkData::RecordBatches(stream) => stream, + crate::QueryChunkData::Parquet(_) => { + return Err(DataFusionError::Execution(String::from( + "chunk must contain record batches", + ))); + } + }; + let virtual_columns = HashMap::from([( + CHUNK_ORDER_COLUMN_NAME, + ScalarValue::from(chunk.order().get()), + )]); + let adapter = Box::pin( + SchemaAdapterStream::try_new(stream, schema, &virtual_columns, baseline_metrics) + .map_err(|e| DataFusionError::External(Box::new(e)))?, + ); + + trace!(partition, "End RecordBatchesExec::execute"); + Ok(adapter) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result { + Ok(self.statistics.clone()) + } +} + +impl DisplayAs for RecordBatchesExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { + 
match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "RecordBatchesExec: chunks={}", self.chunks.len(),)?; + if !self.schema.fields().is_empty() { + write!(f, ", projection={}", ProjectSchemaDisplay(&self.schema))?; + } + Ok(()) + } + } + } +} diff --git a/iox_query/src/pruning.rs b/iox_query/src/pruning.rs new file mode 100644 index 0000000..50f44f1 --- /dev/null +++ b/iox_query/src/pruning.rs @@ -0,0 +1,689 @@ +//! Implementation of statistics based pruning + +use crate::QueryChunk; +use arrow::{ + array::{ArrayRef, BooleanArray, UInt64Array}, + datatypes::{DataType, SchemaRef}, +}; +use datafusion::{ + physical_expr::execution_props::ExecutionProps, + physical_optimizer::pruning::PruningStatistics, + physical_plan::{ColumnStatistics, Statistics}, + prelude::{col, Column, Expr}, + scalar::ScalarValue, +}; +use datafusion_util::{create_pruning_predicate, lit_timestamptz_nano}; +use observability_deps::tracing::{debug, trace, warn}; +use query_functions::group_by::Aggregate; +use schema::{Schema, TIME_COLUMN_NAME}; +use std::collections::HashSet; +use std::sync::Arc; + +/// Reason why a chunk could not be pruned. +/// +/// Also see [`PruningObserver::could_not_prune`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NotPrunedReason { + /// No expression on predicate + NoExpressionOnPredicate, + + /// Can not create pruning predicate + CanNotCreatePruningPredicate, + + /// DataFusion pruning failed + DataFusionPruningFailed, +} + +impl NotPrunedReason { + /// Human-readable string representation. 
+ pub fn name(&self) -> &'static str { + match self { + Self::NoExpressionOnPredicate => "No expression on predicate", + Self::CanNotCreatePruningPredicate => "Can not create pruning predicate", + Self::DataFusionPruningFailed => "DataFusion pruning failed", + } + } +} + +impl std::fmt::Display for NotPrunedReason { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.name()) + } +} + +/// Something that cares to be notified when pruning of chunks occurs +pub trait PruningObserver { + /// Called when the specified chunk was pruned + fn was_pruned(&self, _chunk: &dyn QueryChunk) {} + + /// Called when a chunk was not pruned. + fn was_not_pruned(&self, _chunk: &dyn QueryChunk) {} + + /// Called when no pruning can happen at all for some reason. + /// + /// Since pruning is optional and _only_ improves performance but its lack does not affect correctness, this will + /// NOT lead to a query error. + /// + /// In this case, statistical pruning will not happen and neither [`was_pruned`](Self::was_pruned) nor + /// [`was_not_pruned`](Self::was_not_pruned) will be called. + fn could_not_prune(&self, _reason: NotPrunedReason, _chunk: &dyn QueryChunk) {} +} + +/// Given a Vec of prunable items, returns a possibly smaller set +/// filtering those where the predicate can be proven to evaluate to +/// `false` for every single row. 
+pub fn prune_chunks( + table_schema: &Schema, + chunks: &[Arc], + filters: &[Expr], +) -> Result, NotPrunedReason> { + let num_chunks = chunks.len(); + debug!(num_chunks, ?filters, "Pruning chunks"); + let summaries: Vec<_> = chunks + .iter() + .map(|c| (c.stats(), c.schema().as_arrow())) + .collect(); + + let filter_expr = match filters.iter().cloned().reduce(|a, b| a.and(b)) { + Some(expr) => expr, + None => { + debug!("No expression on predicate"); + return Err(NotPrunedReason::NoExpressionOnPredicate); + } + }; + + prune_summaries(table_schema, &summaries, &filter_expr) +} + +/// Given a `Vec` of pruning summaries, return a `Vec` where `false` indicates that the +/// predicate can be proven to evaluate to `false` for every single row. +pub fn prune_summaries( + table_schema: &Schema, + summaries: &[(Arc, SchemaRef)], + filter_expr: &Expr, +) -> Result, NotPrunedReason> { + trace!(%filter_expr, "Filter_expr of pruning chunks"); + + // no information about the queries here + let props = ExecutionProps::new(); + let pruning_predicate = + match create_pruning_predicate(&props, filter_expr, &table_schema.as_arrow()) { + Ok(p) => p, + Err(e) => { + warn!(%e, ?filter_expr, "Can not create pruning predicate"); + return Err(NotPrunedReason::CanNotCreatePruningPredicate); + } + }; + + let statistics = ChunkPruningStatistics { + table_schema, + summaries, + }; + + let results = match pruning_predicate.prune(&statistics) { + Ok(results) => results, + Err(e) => { + warn!(%e, ?filter_expr, "DataFusion pruning failed"); + return Err(NotPrunedReason::DataFusionPruningFailed); + } + }; + Ok(results) +} + +/// Wraps a collection of [`QueryChunk`] and implements the [`PruningStatistics`] +/// interface required for pruning +struct ChunkPruningStatistics<'a> { + table_schema: &'a Schema, + summaries: &'a [(Arc, SchemaRef)], +} + +impl<'a> ChunkPruningStatistics<'a> { + /// Returns the [`DataType`] for `column` + fn column_type(&self, column: &Column) -> Option<&DataType> { + let 
index = self.table_schema.find_index_of(&column.name)?; + Some(self.table_schema.field(index).1.data_type()) + } + + /// Returns an iterator that for each chunk returns the [`Statistics`] + /// for the provided `column` if any + fn column_summaries<'b: 'a, 'c: 'a>( + &'c self, + column: &'b Column, + ) -> impl Iterator> + 'a { + self.summaries.iter().map(|(stats, schema)| { + let idx = schema.index_of(&column.name).ok()?; + Some(&stats.column_statistics[idx]) + }) + } +} + +impl<'a> PruningStatistics for ChunkPruningStatistics<'a> { + fn min_values(&self, column: &Column) -> Option { + let data_type = self.column_type(column)?; + let summaries = self.column_summaries(column); + collect_pruning_stats(data_type, summaries, Aggregate::Min) + } + + fn max_values(&self, column: &Column) -> Option { + let data_type = self.column_type(column)?; + let summaries = self.column_summaries(column); + collect_pruning_stats(data_type, summaries, Aggregate::Max) + } + + fn num_containers(&self) -> usize { + self.summaries.len() + } + + fn null_counts(&self, column: &Column) -> Option { + let null_counts = self + .column_summaries(column) + .map(|stats| stats.and_then(|stats| stats.null_count.get_value())) + .map(|x| x.map(|x| *x as u64)); + + Some(Arc::new(UInt64Array::from_iter(null_counts))) + } + + fn contained( + &self, + _column: &datafusion::common::Column, + _values: &HashSet, + ) -> Option { + None + } +} + +/// Collects an [`ArrayRef`] containing the aggregate statistic corresponding to +/// `aggregate` for each of the provided [`Statistics`] +fn collect_pruning_stats<'a>( + data_type: &DataType, + statistics: impl Iterator>, + aggregate: Aggregate, +) -> Option { + let null = ScalarValue::try_from(data_type).ok()?; + + ScalarValue::iter_to_array(statistics.map(|stats| { + stats + .and_then(|stats| get_aggregate(stats, aggregate).cloned()) + .unwrap_or_else(|| null.clone()) + })) + .ok() +} + +/// Returns the aggregate statistic corresponding to `aggregate` from `stats` 
+fn get_aggregate(stats: &ColumnStatistics, aggregate: Aggregate) -> Option<&ScalarValue> { + match aggregate { + Aggregate::Min => stats.min_value.get_value(), + Aggregate::Max => stats.max_value.get_value(), + _ => None, + } +} + +/// Retention time expression, "time > retention_time". +pub fn retention_expr(retention_time: i64) -> Expr { + col(TIME_COLUMN_NAME).gt(lit_timestamptz_nano(retention_time)) +} + +#[cfg(test)] +mod test { + use std::{ops::Not, sync::Arc}; + + use datafusion::prelude::{col, lit}; + use datafusion_util::lit_dict; + use schema::merge::SchemaMerger; + + use crate::{test::TestChunk, QueryChunk}; + + use super::*; + + #[test] + fn test_empty() { + test_helpers::maybe_start_logging(); + let c1 = Arc::new(TestChunk::new("chunk1")); + + let result = prune_chunks(&c1.schema().clone(), &[c1], &[]); + + assert_eq!(result, Err(NotPrunedReason::NoExpressionOnPredicate)); + } + + #[test] + fn test_pruned_f64() { + test_helpers::maybe_start_logging(); + // column1 > 100.0 where + // c1: [0.0, 10.0] --> pruned + let c1 = Arc::new(TestChunk::new("chunk1").with_f64_field_column_with_stats( + "column1", + Some(0.0), + Some(10.0), + )); + + let filters = vec![col("column1").gt(lit(100.0f64))]; + + let result = prune_chunks(&c1.schema().clone(), &[c1], &filters); + assert_eq!(result.expect("pruning succeeds"), vec![false]); + } + + #[test] + fn test_pruned_i64() { + test_helpers::maybe_start_logging(); + // column1 > 100 where + // c1: [0, 10] --> pruned + + let c1 = Arc::new(TestChunk::new("chunk1").with_i64_field_column_with_stats( + "column1", + Some(0), + Some(10), + )); + + let filters = vec![col("column1").gt(lit(100i64))]; + + let result = prune_chunks(&c1.schema().clone(), &[c1], &filters); + + assert_eq!(result.expect("pruning succeeds"), vec![false]); + } + + #[test] + fn test_pruned_u64() { + test_helpers::maybe_start_logging(); + // column1 > 100 where + // c1: [0, 10] --> pruned + + let c1 = 
Arc::new(TestChunk::new("chunk1").with_u64_field_column_with_stats( + "column1", + Some(0), + Some(10), + )); + + let filters = vec![col("column1").gt(lit(100u64))]; + + let result = prune_chunks(&c1.schema().clone(), &[c1], &filters); + assert_eq!(result.expect("pruning succeeds"), vec![false]); + } + + #[test] + fn test_pruned_bool() { + test_helpers::maybe_start_logging(); + // column1 where + // c1: [false, false] --> pruned + let c1 = Arc::new(TestChunk::new("chunk1").with_bool_field_column_with_stats( + "column1", + Some(false), + Some(false), + )); + + let filters = vec![col("column1")]; + + let result = prune_chunks(&c1.schema().clone(), &[c1], &filters); + assert_eq!(result.expect("pruning succeeds"), vec![false; 1]); + } + + #[test] + fn test_pruned_string() { + test_helpers::maybe_start_logging(); + // column1 > "z" where + // c1: ["a", "q"] --> pruned + + let c1 = Arc::new( + TestChunk::new("chunk1").with_string_field_column_with_stats( + "column1", + Some("a"), + Some("q"), + ), + ); + + let filters = vec![col("column1").gt(lit("z"))]; + + let result = prune_chunks(&c1.schema().clone(), &[c1], &filters); + assert_eq!(result.expect("pruning succeeds"), vec![false]); + } + + #[test] + fn test_not_pruned_f64() { + test_helpers::maybe_start_logging(); + // column1 < 100.0 where + // c1: [0.0, 10.0] --> not pruned + let c1 = Arc::new(TestChunk::new("chunk1").with_f64_field_column_with_stats( + "column1", + Some(0.0), + Some(10.0), + )); + + let filters = vec![col("column1").lt(lit(100.0f64))]; + + let result = prune_chunks(&c1.schema().clone(), &[c1], &filters); + assert_eq!(result.expect("pruning succeeds"), vec![true]); + } + + #[test] + fn test_not_pruned_i64() { + test_helpers::maybe_start_logging(); + // column1 < 100 where + // c1: [0, 10] --> not pruned + + let c1 = Arc::new(TestChunk::new("chunk1").with_i64_field_column_with_stats( + "column1", + Some(0), + Some(10), + )); + + let filters = vec![col("column1").lt(lit(100i64))]; + + let result = 
prune_chunks(&c1.schema().clone(), &[c1], &filters); + assert_eq!(result.expect("pruning succeeds"), vec![true]); + } + + #[test] + fn test_not_pruned_u64() { + test_helpers::maybe_start_logging(); + // column1 < 100 where + // c1: [0, 10] --> not pruned + + let c1 = Arc::new(TestChunk::new("chunk1").with_u64_field_column_with_stats( + "column1", + Some(0), + Some(10), + )); + + let filters = vec![col("column1").lt(lit(100u64))]; + + let result = prune_chunks(&c1.schema().clone(), &[c1], &filters); + assert_eq!(result.expect("pruning succeeds"), vec![true]); + } + + #[test] + fn test_not_pruned_bool() { + test_helpers::maybe_start_logging(); + // column1 + // c1: [false, true] --> not pruned + + let c1 = Arc::new(TestChunk::new("chunk1").with_bool_field_column_with_stats( + "column1", + Some(false), + Some(true), + )); + + let filters = vec![col("column1")]; + + let result = prune_chunks(&c1.schema().clone(), &[c1], &filters); + assert_eq!(result.expect("pruning succeeds"), vec![true]); + } + + #[test] + fn test_not_pruned_string() { + test_helpers::maybe_start_logging(); + // column1 < "z" where + // c1: ["a", "q"] --> not pruned + + let c1 = Arc::new( + TestChunk::new("chunk1").with_string_field_column_with_stats( + "column1", + Some("a"), + Some("q"), + ), + ); + + let filters = vec![col("column1").lt(lit("z"))]; + + let result = prune_chunks(&c1.schema().clone(), &[c1], &filters); + assert_eq!(result.expect("pruning succeeds"), vec![true]); + } + + fn merge_schema(chunks: &[Arc]) -> Schema { + let mut merger = SchemaMerger::new(); + for chunk in chunks { + merger = merger.merge(chunk.schema()).unwrap(); + } + merger.build() + } + + #[test] + fn test_pruned_null() { + test_helpers::maybe_start_logging(); + // column1 > 100 where + // c1: [Null, 10] --> pruned + // c2: [0, Null] --> not pruned + // c3: [Null, Null] --> not pruned (min/max are not known in chunk 3) + // c4: Null --> not pruned (no statistics at all) + + let c1 = 
Arc::new(TestChunk::new("chunk1").with_i64_field_column_with_stats( + "column1", + None, + Some(10), + )) as Arc; + + let c2 = Arc::new(TestChunk::new("chunk2").with_i64_field_column_with_stats( + "column1", + Some(0), + None, + )) as Arc; + + let c3 = Arc::new( + TestChunk::new("chunk3").with_i64_field_column_with_stats("column1", None, None), + ) as Arc; + + let c4 = Arc::new(TestChunk::new("chunk4").with_i64_field_column("column1")) + as Arc; + + let filters = vec![col("column1").gt(lit(100i64))]; + + let chunks = vec![c1, c2, c3, c4]; + let schema = merge_schema(&chunks); + + let result = prune_chunks(&schema, &chunks, &filters); + + assert_eq!( + result.expect("pruning succeeds"), + vec![false, true, true, true] + ); + } + + #[test] + fn test_pruned_multi_chunk() { + test_helpers::maybe_start_logging(); + // column1 > 100 where + // c1: [0, 10] --> pruned + // c2: [0, 1000] --> not pruned + // c3: [10, 20] --> pruned + // c4: [None, None] --> not pruned + // c5: [10, None] --> not pruned + // c6: [None, 10] --> pruned + + let c1 = Arc::new(TestChunk::new("chunk1").with_i64_field_column_with_stats( + "column1", + Some(0), + Some(10), + )) as Arc; + + let c2 = Arc::new(TestChunk::new("chunk2").with_i64_field_column_with_stats( + "column1", + Some(0), + Some(1000), + )) as Arc; + + let c3 = Arc::new(TestChunk::new("chunk3").with_i64_field_column_with_stats( + "column1", + Some(10), + Some(20), + )) as Arc; + + let c4 = Arc::new( + TestChunk::new("chunk4").with_i64_field_column_with_stats("column1", None, None), + ) as Arc; + + let c5 = Arc::new(TestChunk::new("chunk5").with_i64_field_column_with_stats( + "column1", + Some(10), + None, + )) as Arc; + + let c6 = Arc::new(TestChunk::new("chunk6").with_i64_field_column_with_stats( + "column1", + None, + Some(20), + )) as Arc; + + let filters = vec![col("column1").gt(lit(100i64))]; + + let chunks = vec![c1, c2, c3, c4, c5, c6]; + let schema = merge_schema(&chunks); + + let result = prune_chunks(&schema, &chunks, 
&filters); + + assert_eq!( + result.expect("pruning succeeds"), + vec![false, true, false, true, true, false] + ); + } + + #[test] + fn test_pruned_different_schema() { + test_helpers::maybe_start_logging(); + // column1 > 100 where + // c1: column1 [0, 100], column2 [0, 4] --> pruned (in range, column2 ignored) + // c2: column1 [0, 1000], column2 [0, 4] --> not pruned (in range, column2 ignored) + // c3: None, column2 [0, 4] --> not pruned (no stats for column1) + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_i64_field_column_with_stats("column1", Some(0), Some(100)) + .with_i64_field_column_with_stats("column2", Some(0), Some(4)), + ) as Arc; + + let c2 = Arc::new( + TestChunk::new("chunk2") + .with_i64_field_column_with_stats("column1", Some(0), Some(1000)) + .with_i64_field_column_with_stats("column2", Some(0), Some(4)), + ) as Arc; + + let c3 = Arc::new(TestChunk::new("chunk3").with_i64_field_column_with_stats( + "column2", + Some(0), + Some(4), + )) as Arc; + + let filters = vec![col("column1").gt(lit(100i64))]; + + let chunks = vec![c1, c2, c3]; + let schema = merge_schema(&chunks); + + let result = prune_chunks(&schema, &chunks, &filters); + + assert_eq!(result.expect("pruning succeeds"), vec![false, true, true]); + } + + #[test] + fn test_pruned_is_null() { + test_helpers::maybe_start_logging(); + // Verify that type of predicate is pruned if column1 is null + // (this is a common predicate type created by the INfluxRPC planner) + // (NOT column1 IS NULL) AND (column1 = 'bar') + + // No nulls, can't prune as it has values that are more and less than 'bar' + let c1 = Arc::new( + TestChunk::new("chunk1").with_tag_column_with_nulls_and_full_stats( + "column1", + Some("a"), + Some("z"), + 100, + None, + 0, + ), + ) as Arc; + + // Has no nulls, can prune it out based on statistics alone + let c2 = Arc::new( + TestChunk::new("chunk2").with_tag_column_with_nulls_and_full_stats( + "column1", + Some("a"), + Some("b"), + 100, + None, + 0, + ), + ) as Arc; + 
+ // Has nulls, can still can prune it out based on statistics alone + let c3 = Arc::new( + TestChunk::new("chunk3").with_tag_column_with_nulls_and_full_stats( + "column1", + Some("a"), + Some("b"), + 100, + None, + 1, // that one peksy null! + ), + ) as Arc; + + let filters = vec![col("column1") + .is_null() + .not() + .and(col("column1").eq(lit_dict("bar")))]; + + let chunks = vec![c1, c2, c3]; + let schema = merge_schema(&chunks); + + let result = prune_chunks(&schema, &chunks, &filters); + + assert_eq!(result.expect("pruning succeeds"), vec![true, false, false]); + } + + #[test] + fn test_pruned_multi_column() { + test_helpers::maybe_start_logging(); + // column1 > 100 AND column2 < 5 where + // c1: column1 [0, 1000], column2 [0, 4] --> not pruned (both in range) + // c2: column1 [0, 10], column2 [0, 4] --> pruned (column1 and column2 out of range) + // c3: column1 [0, 10], column2 [5, 10] --> pruned (column1 out of range, column2 in of range) + // c4: column1 [1000, 2000], column2 [0, 4] --> not pruned (column1 in range, column2 in range) + // c5: column1 [0, 10], column2 Null --> pruned (column1 out of range, but column2 has no stats) + // c6: column1 Null, column2 [0, 4] --> not pruned (column1 has no stats, column2 out of range) + + let c1 = Arc::new( + TestChunk::new("chunk1") + .with_i64_field_column_with_stats("column1", Some(0), Some(1000)) + .with_i64_field_column_with_stats("column2", Some(0), Some(4)), + ) as Arc; + + let c2 = Arc::new( + TestChunk::new("chunk2") + .with_i64_field_column_with_stats("column1", Some(0), Some(10)) + .with_i64_field_column_with_stats("column2", Some(0), Some(4)), + ) as Arc; + + let c3 = Arc::new( + TestChunk::new("chunk3") + .with_i64_field_column_with_stats("column1", Some(0), Some(10)) + .with_i64_field_column_with_stats("column2", Some(5), Some(10)), + ) as Arc; + + let c4 = Arc::new( + TestChunk::new("chunk4") + .with_i64_field_column_with_stats("column1", Some(1000), Some(2000)) + 
.with_i64_field_column_with_stats("column2", Some(0), Some(4)), + ) as Arc; + + let c5 = Arc::new( + TestChunk::new("chunk5") + .with_i64_field_column_with_stats("column1", Some(0), Some(10)) + .with_i64_field_column("column2"), + ) as Arc; + + let c6 = Arc::new( + TestChunk::new("chunk6") + .with_i64_field_column("column1") + .with_i64_field_column_with_stats("column2", Some(0), Some(4)), + ) as Arc; + + let filters = vec![col("column1") + .gt(lit(100i64)) + .and(col("column2").lt(lit(5i64)))]; + + let chunks = vec![c1, c2, c3, c4, c5, c6]; + let schema = merge_schema(&chunks); + + let result = prune_chunks(&schema, &chunks, &filters); + + assert_eq!( + result.expect("Pruning succeeds"), + vec![true, false, false, true, false, true] + ); + } +} diff --git a/iox_query/src/query_log.rs b/iox_query/src/query_log.rs new file mode 100644 index 0000000..e6ae929 --- /dev/null +++ b/iox_query/src/query_log.rs @@ -0,0 +1,704 @@ +//! Ring buffer of queries that have been run with some brief information + +use data_types::NamespaceId; +use datafusion::physical_plan::ExecutionPlan; +use iox_time::{Time, TimeProvider}; +use observability_deps::tracing::{info, warn}; +use parking_lot::Mutex; +use std::{ + collections::VecDeque, + fmt::Debug, + sync::{ + atomic::{self, AtomicBool, AtomicI64, AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; +use trace::ctx::TraceId; +use uuid::Uuid; + +/// The query duration used for queries still running. +const UNCOMPLETED_DURATION: i64 = -1; + +/// Information about a single query that was executed +pub struct QueryLogEntry { + /// Unique ID. + pub id: Uuid, + + /// Namespace ID. + pub namespace_id: NamespaceId, + + /// Namespace name. 
+ pub namespace_name: Arc, + + /// The type of query + pub query_type: &'static str, + + /// The text of the query (SQL for sql queries, pbjson for storage rpc queries) + pub query_text: QueryText, + + /// The trace ID if any + pub trace_id: Option, + + /// Time at which the query was run + pub issue_time: Time, + + /// Duration it took to acquire a semaphore permit, relative to [`issue_time`](Self::issue_time). + permit_duration: AtomicDuration, + + /// Duration it took to plan the query, relative to [`issue_time`](Self::issue_time) + [`permit_duration`](Self::permit_duration). + plan_duration: AtomicDuration, + + /// Duration it took to execute the query, relative to [`issue_time`](Self::issue_time) + + /// [`permit_duration`](Self::permit_duration) + [`plan_duration`](Self::plan_duration). + execute_duration: AtomicDuration, + + /// Duration from [`issue_time`](Self::issue_time) til the query ended somehow. + end2end_duration: AtomicDuration, + + /// CPU duration spend for computation. + compute_duration: AtomicDuration, + + /// If the query completed successfully + success: AtomicBool, + + /// If the query is currently running (in any state). 
+ running: AtomicBool, +} + +impl Debug for QueryLogEntry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QueryLogEntry") + .field("id", &self.id) + .field("namespace_id", &self.namespace_id) + .field("namespace_name", &self.namespace_name) + .field("query_type", &self.query_type) + .field("query_text", &self.query_text.to_string()) + .field("trace_id", &self.trace_id) + .field("issue_time", &self.issue_time) + .field("permit_duration", &self.permit_duration()) + .field("plan_duration", &self.plan_duration()) + .field("execute_duration", &self.execute_duration()) + .field("end2end_duration", &self.end2end_duration()) + .field("compute_duration", &self.compute_duration()) + .field("success", &self.success()) + .field("running", &self.running()) + .finish() + } +} + +impl QueryLogEntry { + /// Duration it took to acquire a semaphore permit, relative to [`issue_time`](Self::issue_time). + pub fn permit_duration(&self) -> Option { + self.permit_duration.get() + } + + /// Duration it took to plan the query, relative to [`issue_time`](Self::issue_time) + [`permit_duration`](Self::permit_duration). + pub fn plan_duration(&self) -> Option { + self.plan_duration.get() + } + + /// Duration it took to execute the query, relative to [`issue_time`](Self::issue_time) + + /// [`permit_duration`](Self::permit_duration) + [`plan_duration`](Self::plan_duration). + pub fn execute_duration(&self) -> Option { + self.execute_duration.get() + } + + /// Duration from [`issue_time`](Self::issue_time) til the query ended somehow. + pub fn end2end_duration(&self) -> Option { + self.end2end_duration.get() + } + + /// CPU duration spend for computation. + pub fn compute_duration(&self) -> Option { + self.compute_duration.get() + } + + /// Returns true if `set_completed` was called with `success=true` + pub fn success(&self) -> bool { + self.success.load(Ordering::SeqCst) + } + + /// If the query is currently running (in any state). 
+ pub fn running(&self) -> bool { + self.running.load(Ordering::SeqCst) + } + + /// Log entry. + pub fn log(&self, when: &'static str) { + info!( + when, + id=%self.id, + namespace_id=self.namespace_id.get(), + namespace_name=self.namespace_name.as_ref(), + query_type=self.query_type, + query_text=%self.query_text, + trace_id=self.trace_id.map(|id| format!("{:x}", id.get())), + issue_time=%self.issue_time, + plan_duration_secs=self.plan_duration().map(|d| d.as_secs_f64()), + permit_duration_secs=self.permit_duration().map(|d| d.as_secs_f64()), + execute_duration_secs=self.execute_duration().map(|d| d.as_secs_f64()), + end2end_duration_secs=self.end2end_duration().map(|d| d.as_secs_f64()), + compute_duration_secs=self.compute_duration().map(|d| d.as_secs_f64()), + success=self.success(), + running=self.running(), + "query", + ) + } +} + +/// Snapshot of the entries the [`QueryLog`]. +#[derive(Debug)] +pub struct QueryLogEntries { + /// Entries. + pub entries: VecDeque>, + + /// Maximum number of entries + pub max_size: usize, + + /// Number of evicted entries due to the "max size" constraint. + pub evicted: usize, +} + +/// Stores a fixed number `QueryExecutions` -- handles locking +/// internally so can be shared across multiple +pub struct QueryLog { + log: Mutex>>, + max_size: usize, + evicted: AtomicUsize, + time_provider: Arc, + id_gen: IDGen, +} + +impl QueryLog { + /// Create a new QueryLog that can hold at most `size` items. + /// When the `size+1` item is added, item `0` is evicted. 
+ pub fn new(max_size: usize, time_provider: Arc) -> Self { + Self::new_with_id_gen(max_size, time_provider, Box::new(Uuid::new_v4)) + } + + pub fn new_with_id_gen( + max_size: usize, + time_provider: Arc, + id_gen: IDGen, + ) -> Self { + Self { + log: Mutex::new(VecDeque::with_capacity(max_size)), + max_size, + evicted: AtomicUsize::new(0), + time_provider, + id_gen, + } + } + + pub fn push( + &self, + namespace_id: NamespaceId, + namespace_name: Arc, + query_type: &'static str, + query_text: QueryText, + trace_id: Option, + ) -> QueryCompletedToken { + let entry = Arc::new(QueryLogEntry { + id: (self.id_gen)(), + namespace_id, + namespace_name, + query_type, + query_text, + trace_id, + issue_time: self.time_provider.now(), + permit_duration: Default::default(), + plan_duration: Default::default(), + execute_duration: Default::default(), + end2end_duration: Default::default(), + compute_duration: Default::default(), + success: atomic::AtomicBool::new(false), + running: atomic::AtomicBool::new(true), + }); + entry.log("start"); + let token = QueryCompletedToken { + entry: Some(Arc::clone(&entry)), + time_provider: Arc::clone(&self.time_provider), + state: Default::default(), + }; + + if self.max_size == 0 { + return token; + } + + let mut log = self.log.lock(); + + // enforce limit + while log.len() > self.max_size { + log.pop_front(); + self.evicted.fetch_add(1, Ordering::SeqCst); + } + + log.push_back(Arc::clone(&entry)); + token + } + + pub fn entries(&self) -> QueryLogEntries { + let log = self.log.lock(); + QueryLogEntries { + entries: log.clone(), + max_size: self.max_size, + evicted: self.evicted.load(Ordering::SeqCst), + } + } +} + +impl Debug for QueryLog { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QueryLog") + .field("log", &self.log) + .field("max_size", &self.max_size) + .field("evicted", &self.evicted) + .field("time_provider", &self.time_provider) + .field("id_gen", &"") + .finish() + } +} + +/// State of 
[`QueryCompletedToken`]. +/// +/// # Done +/// - The query has been received (and potentially authenticated) by the server. +/// +/// # To Do +/// - The concurrency-limiting semaphore has NOT yet issued a permit. +/// - The query is not planned. +/// - The query has not been executed. +#[derive(Debug, Clone, Copy, Default)] +pub struct StateReceived; + +/// State of [`QueryCompletedToken`]. +/// +/// # Done +/// - The query has been received (and potentially authenticated) by the server. +/// - The concurrency-limiting semaphore has issued a permit. +/// - The query was planned. +/// +/// # To Do +/// - The concurrency-limiting semaphore has NOT yet issued a permit. +/// - The query has not been executed. +#[derive(Debug)] +pub struct StatePlanned { + /// Physical execution plan. + plan: Arc, +} + +/// State of [`QueryCompletedToken`]. +/// +/// # Done +/// - The query has been received (and potentially authenticated) by the server. +/// - The concurrency-limiting semaphore has issued a permit. +/// +/// # To Do +/// - The query has not been executed. +#[derive(Debug)] +pub struct StatePermit { + /// Physical execution plan. + plan: Arc, +} + +/// A `QueryCompletedToken` is returned by `record_query` implementations of +/// a `QueryNamespace`. It is used to trigger side-effects (such as query timing) +/// on query completion. +#[derive(Debug)] +pub struct QueryCompletedToken { + /// Entry. + /// + /// This is optional so we can implement type state and [`Drop`] at the same time. + entry: Option>, + + /// Time provider + time_provider: Arc, + + /// Current state. + state: S, +} + +impl QueryCompletedToken { + /// Underlying entry. + pub fn entry(&self) -> &Arc { + self.entry.as_ref().expect("valid state") + } +} + +impl QueryCompletedToken { + /// Record that this query got planned. 
+ pub fn planned(mut self, plan: Arc) -> QueryCompletedToken { + let entry = self.entry.take().expect("valid state"); + + let now = self.time_provider.now(); + let origin = entry.issue_time; + entry.plan_duration.set_relative(origin, now); + + QueryCompletedToken { + entry: Some(entry), + time_provider: Arc::clone(&self.time_provider), + state: StatePlanned { plan }, + } + } +} + +impl QueryCompletedToken { + /// Record that this query got a semaphore permit. + pub fn permit(mut self) -> QueryCompletedToken { + let entry = self.entry.take().expect("valid state"); + + let now = self.time_provider.now(); + let origin = entry.issue_time + entry.plan_duration().expect("valid state"); + entry.permit_duration.set_relative(origin, now); + + QueryCompletedToken { + entry: Some(entry), + time_provider: Arc::clone(&self.time_provider), + state: StatePermit { + plan: Arc::clone(&self.state.plan), + }, + } + } +} + +impl QueryCompletedToken { + /// Record that this query completed successfully + pub fn success(self) { + let entry = self.entry.as_ref().expect("valid state"); + entry.success.store(true, Ordering::SeqCst); + + self.finish() + } + + /// Record that the query finished execution with an error. 
+ pub fn fail(self) { + self.finish() + } + + fn finish(&self) { + let entry = self.entry.as_ref().expect("valid state"); + + let now = self.time_provider.now(); + let origin = entry.issue_time + + entry.permit_duration().expect("valid state") + + entry.plan_duration().expect("valid state"); + entry.execute_duration.set_relative(origin, now); + + entry + .compute_duration + .set_absolute(collect_compute_duration(self.state.plan.as_ref())); + } +} + +impl Drop for QueryCompletedToken { + fn drop(&mut self) { + if let Some(entry) = self.entry.take() { + let now = self.time_provider.now(); + entry.end2end_duration.set_relative(entry.issue_time, now); + entry.running.store(false, Ordering::SeqCst); + + entry.log("end"); + } + } +} + +/// Boxed description of a query that knows how to render to a string +/// +/// This avoids storing potentially large strings +pub type QueryText = Box; + +/// Method that generated [`Uuid`]s. +pub type IDGen = Box Uuid + Send + Sync>; + +struct AtomicDuration(AtomicI64); + +impl AtomicDuration { + fn get(&self) -> Option { + match self.0.load(Ordering::Relaxed) { + UNCOMPLETED_DURATION => None, + d => Some(Duration::from_nanos(d as u64)), + } + } + + fn set_relative(&self, origin: Time, now: Time) { + match now.checked_duration_since(origin) { + Some(dur) => { + self.0.store(dur.as_nanos() as i64, Ordering::Relaxed); + } + None => { + warn!("Clock went backwards, not query duration") + } + } + } + + fn set_absolute(&self, d: Duration) { + self.0.store(d.as_nanos() as i64, Ordering::Relaxed); + } +} + +impl Default for AtomicDuration { + fn default() -> Self { + Self(AtomicI64::new(UNCOMPLETED_DURATION)) + } +} + +/// Collect compute duration from [`ExecutionPlan`]. 
+fn collect_compute_duration(plan: &dyn ExecutionPlan) -> Duration { + let mut total = Duration::ZERO; + + if let Some(metrics) = plan.metrics() { + if let Some(nanos) = metrics.elapsed_compute() { + total += Duration::from_nanos(nanos as u64); + } + } + + for child in plan.children() { + total += collect_compute_duration(child.as_ref()); + } + + total +} + +#[cfg(test)] +mod test_super { + use datafusion::error::DataFusionError; + use std::sync::atomic::AtomicU64; + + use datafusion::physical_plan::{ + metrics::{MetricValue, MetricsSet}, + DisplayAs, Metric, + }; + use iox_time::MockProvider; + use test_helpers::tracing::TracingCapture; + + use super::*; + + #[test] + fn test_token_end2end_success() { + let capture = TracingCapture::new(); + + let Test { + time_provider, + token, + entry, + } = Test::default(); + + assert!(!entry.success()); + assert!(entry.running()); + assert_eq!(entry.permit_duration(), None,); + assert_eq!(entry.plan_duration(), None,); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), None,); + assert_eq!(entry.compute_duration(), None,); + + time_provider.inc(Duration::from_millis(1)); + let token = token.planned(plan()); + + assert!(!entry.success()); + assert!(entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), None,); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), None,); + assert_eq!(entry.compute_duration(), None,); + + time_provider.inc(Duration::from_millis(10)); + let token = token.permit(); + + assert!(!entry.success()); + assert!(entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), Some(Duration::from_millis(10)),); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), None,); + assert_eq!(entry.compute_duration(), None,); + + time_provider.inc(Duration::from_millis(100)); + 
token.success(); + + assert!(entry.success()); + assert!(!entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), Some(Duration::from_millis(10)),); + assert_eq!(entry.execute_duration(), Some(Duration::from_millis(100)),); + assert_eq!(entry.end2end_duration(), Some(Duration::from_millis(111)),); + assert_eq!(entry.compute_duration(), Some(Duration::from_millis(1_337)),); + + assert_eq!( + capture.to_string().trim(), + [ + r#"level = INFO; message = query; when = "start"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; success = false; running = true;"#, + r#"level = INFO; message = query; when = "end"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; plan_duration_secs = 0.001; permit_duration_secs = 0.01; execute_duration_secs = 0.1; end2end_duration_secs = 0.111; compute_duration_secs = 1.337; success = true; running = false;"#, + ].join(" \n") + ); + } + + #[test] + fn test_token_execution_fail() { + let capture = TracingCapture::new(); + + let Test { + time_provider, + token, + entry, + } = Test::default(); + + time_provider.inc(Duration::from_millis(1)); + let token = token.planned(plan()); + time_provider.inc(Duration::from_millis(10)); + let token = token.permit(); + time_provider.inc(Duration::from_millis(100)); + token.fail(); + + assert!(!entry.success()); + assert!(!entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), Some(Duration::from_millis(10)),); + assert_eq!(entry.execute_duration(), Some(Duration::from_millis(100)),); + assert_eq!(entry.end2end_duration(), Some(Duration::from_millis(111)),); + assert_eq!(entry.compute_duration(), 
Some(Duration::from_millis(1_337)),); + + assert_eq!( + capture.to_string().trim(), + [ + r#"level = INFO; message = query; when = "start"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; success = false; running = true;"#, + r#"level = INFO; message = query; when = "end"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; plan_duration_secs = 0.001; permit_duration_secs = 0.01; execute_duration_secs = 0.1; end2end_duration_secs = 0.111; compute_duration_secs = 1.337; success = false; running = false;"#, + ].join(" \n") + ); + } + + #[test] + fn test_token_drop_before_acquire() { + let capture = TracingCapture::new(); + + let Test { + time_provider, + token, + entry, + } = Test::default(); + + time_provider.inc(Duration::from_millis(100)); + drop(token); + + assert!(!entry.success()); + assert!(!entry.running()); + assert_eq!(entry.permit_duration(), None,); + assert_eq!(entry.plan_duration(), None,); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), Some(Duration::from_millis(100)),); + assert_eq!(entry.compute_duration(), None,); + + assert_eq!( + capture.to_string().trim(), + [ + r#"level = INFO; message = query; when = "start"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; success = false; running = true;"#, + r#"level = INFO; message = query; when = "end"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; end2end_duration_secs = 0.1; success = false; running = false;"#, + ].join(" \n") + ); + } + + struct Test { + time_provider: Arc, + token: 
QueryCompletedToken, + entry: Arc, + } + + impl Default for Test { + fn default() -> Self { + let time_provider = + Arc::new(MockProvider::new(Time::from_timestamp_millis(100).unwrap())); + let id_counter = AtomicU64::new(1); + let log = QueryLog::new_with_id_gen( + 1_000, + Arc::clone(&time_provider) as _, + Box::new(move || Uuid::from_u128(id_counter.fetch_add(1, Ordering::SeqCst) as _)), + ); + + let token = log.push( + NamespaceId::new(1), + Arc::from("ns"), + "sql", + Box::new("SELECT 1"), + None, + ); + + let entry = Arc::clone(token.entry()); + + Self { + time_provider, + token, + entry, + } + } + } + + fn plan() -> Arc { + Arc::new(TestExec) + } + + #[derive(Debug)] + struct TestExec; + + impl DisplayAs for TestExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + _f: &mut std::fmt::Formatter<'_>, + ) -> std::fmt::Result { + unimplemented!() + } + } + + impl ExecutionPlan for TestExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> arrow::datatypes::SchemaRef { + unimplemented!() + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + unimplemented!() + } + + fn output_ordering(&self) -> Option<&[datafusion::physical_expr::PhysicalSortExpr]> { + unimplemented!() + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> datafusion::error::Result> { + unimplemented!() + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> datafusion::error::Result + { + unimplemented!() + } + + fn statistics(&self) -> Result { + unimplemented!() + } + + fn metrics(&self) -> Option { + let mut metrics = MetricsSet::default(); + + let t = datafusion::physical_plan::metrics::Time::default(); + t.add_duration(Duration::from_millis(1_337)); + metrics.push(Arc::new(Metric::new(MetricValue::ElapsedCompute(t), None))); + + Some(metrics) + } + } +} diff --git a/iox_query/src/statistics.rs b/iox_query/src/statistics.rs new 
file mode 100644 index 0000000..3fc4d54 --- /dev/null +++ b/iox_query/src/statistics.rs @@ -0,0 +1,1447 @@ +//! Code to translate IOx statistics to DataFusion statistics + +use std::collections::{HashMap, VecDeque}; +use std::sync::Arc; + +use arrow::compute::rank; +use arrow::datatypes::{Schema, SchemaRef}; +use datafusion::common::stats::Precision; +use datafusion::datasource::physical_plan::ParquetExec; +use datafusion::error::DataFusionError; +use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::empty::EmptyExec; +use datafusion::physical_plan::expressions::Column; +use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::placeholder_row::PlaceholderRowExec; +use datafusion::physical_plan::projection::ProjectionExec; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::sorts::sort::SortExec; +use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use datafusion::physical_plan::union::UnionExec; +use datafusion::physical_plan::{visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor}; +use datafusion::{ + physical_plan::{ColumnStatistics, Statistics as DFStatistics}, + scalar::ScalarValue, +}; +use observability_deps::tracing::trace; + +use crate::provider::{DeduplicateExec, RecordBatchesExec}; +use crate::{QueryChunk, CHUNK_ORDER_COLUMN_NAME}; + +/// Aggregates DataFusion [statistics](DFStatistics). +#[derive(Debug)] +pub struct DFStatsAggregator<'a> { + num_rows: Precision, + total_byte_size: Precision, + column_statistics: Vec, + // Maps column name to index in column_statistics for all columns we are + // aggregating + col_idx_map: HashMap<&'a str, usize>, +} + +impl<'a> DFStatsAggregator<'a> { + /// Creates new aggregator the the given schema. 
+ /// + /// This will start with: + /// + /// - 0 rows + /// - 0 bytes + /// - for each column: + /// - 0 null values + /// - unknown min value + /// - unknown max value + /// - exact representation + pub fn new(schema: &'a Schema) -> Self { + let col_idx_map = schema + .fields() + .iter() + .enumerate() + .map(|(idx, f)| (f.name().as_str(), idx)) + .collect::>(); + + Self { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: (0..col_idx_map.len()) + .map(|_| DFStatsAggregatorCol { + null_count: Precision::Exact(0), + max_value: None, + min_value: None, + }) + .collect(), + + col_idx_map, + } + } + + /// Update given base statistics with the given schema. + /// + /// This only updates columns that were present when the aggregator was created. Column reordering is allowed. + /// + /// Updates are meant to be "additive", i.e. they only add data/rows. There is NOT way to remove/substract data from + /// the accumulator. + /// + /// # Panics + /// Panics when the number of columns in the statistics and the schema are different. 
+ pub fn update(&mut self, update_stats: &DFStatistics, update_schema: &Schema) { + // decompose structs so we don't forget new fields + let DFStatistics { + num_rows: update_num_rows, + total_byte_size: update_total_byte_size, + column_statistics: update_column_statistics, + } = update_stats; + + self.num_rows = self.num_rows.add(update_num_rows); + self.total_byte_size = self.total_byte_size.add(update_total_byte_size); + + assert_eq!(self.column_statistics.len(), self.col_idx_map.len()); + assert_eq!( + update_column_statistics.len(), + update_schema.fields().len(), + "stats ({}) and schema ({}) have different column count", + update_column_statistics.len(), + update_schema.fields().len(), + ); + + let mut used_cols = vec![false; self.col_idx_map.len()]; + + for (update_field, update_col) in update_schema + .fields() + .iter() + .zip(update_column_statistics.iter()) + { + // Skip if not aggregating statitics for this field + let Some(idx) = self.col_idx_map.get(update_field.name().as_str()) else { + continue; + }; + let base_col = &mut self.column_statistics[*idx]; + used_cols[*idx] = true; + + // decompose structs so we don't forget new fields + let DFStatsAggregatorCol { + null_count: base_null_count, + max_value: base_max_value, + min_value: base_min_value, + } = base_col; + let ColumnStatistics { + null_count: update_null_count, + max_value: update_max_value, + min_value: update_min_value, + distinct_count: _update_distinct_count, + } = update_col; + + *base_null_count = base_null_count.add(update_null_count); + + *base_max_value = Some( + base_max_value + .take() + .map(|base_max_value| base_max_value.max(update_max_value)) + .unwrap_or(update_max_value.clone()), + ); + + *base_min_value = Some( + base_min_value + .take() + .map(|base_min_value| base_min_value.min(update_min_value)) + .unwrap_or(update_min_value.clone()), + ); + } + + // for unused cols, we need to assume all-NULL and hence invalidate the null counters + for (used, base_col) in 
used_cols.into_iter().zip(&mut self.column_statistics) { + if !used { + base_col.null_count = Precision::Absent; + } + } + } + + /// Build aggregated statistics. + pub fn build(self) -> DFStatistics { + DFStatistics { + num_rows: self.num_rows, + total_byte_size: self.total_byte_size, + column_statistics: self + .column_statistics + .into_iter() + .map(|col| ColumnStatistics { + null_count: col.null_count, + max_value: col.max_value.unwrap_or(Precision::Absent), + min_value: col.min_value.unwrap_or(Precision::Absent), + distinct_count: Precision::Absent, + }) + .collect(), + } + } +} + +/// Similar to [`ColumnStatistics`] but uses `Option` to track min/max values so +/// we can differentiate between +/// +/// 1. "uninitialized" (`None`) +/// 1. "initialized" (`Some(Precision::Exact(...))`) +/// 2. "initialized but invalid" (`Some(Precision::Absent)`). +/// +/// It also does NOT contain a distinct count because we cannot aggregate these. +#[derive(Debug)] +struct DFStatsAggregatorCol { + null_count: Precision, + max_value: Option>, + min_value: Option>, +} + +/// build DF statitics for given chunks and a schema +pub fn build_statistics_for_chunks( + chunks: &[Arc], + schema: SchemaRef, +) -> DFStatistics { + let chunk_order_field = schema.field_with_name(CHUNK_ORDER_COLUMN_NAME).ok(); + let chunk_order_only_schema = chunk_order_field.map(|field| Schema::new(vec![field.clone()])); + + let chunks: Vec<_> = chunks.iter().collect(); + + let statistics = chunks + .iter() + .fold(DFStatsAggregator::new(&schema), |mut agg, chunk| { + agg.update(&chunk.stats(), chunk.schema().as_arrow().as_ref()); + + if let Some(schema) = chunk_order_only_schema.as_ref() { + let order = chunk.order().get(); + let order = ScalarValue::from(order); + + agg.update( + &DFStatistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(order.clone()), + min_value: 
Precision::Exact(order), + distinct_count: Precision::Exact(1), + }], + }, + schema, + ); + } + + agg + }) + .build(); + + statistics +} + +/// Traverse the execution plan and build statistics min max for the given column +pub fn compute_stats_column_min_max( + plan: &dyn ExecutionPlan, + column_name: &str, +) -> Result { + let mut visitor = StatisticsVisitor::new(column_name); + visit_execution_plan(plan, &mut visitor)?; + + // there must be only one statistics left in the stack + if visitor.statistics.len() != 1 { + return Err(DataFusionError::Internal(format!( + "There must be only one statistics left in the stack, but find {}", + visitor.statistics.len() + ))); + } + + Ok(visitor.statistics.pop_back().unwrap()) +} + +/// Traverse the physical plan and build statistics min max for the given column each node +/// Note: This is a temproray solution until DF's statistics is more mature +/// +struct StatisticsVisitor<'a> { + column_name: &'a str, //String, // todo: not sure enough + statistics: VecDeque, +} + +impl<'a> StatisticsVisitor<'a> { + fn new(column_name: &'a str) -> Self { + Self { + column_name, + statistics: VecDeque::new(), + } + } +} + +impl ExecutionPlanVisitor for StatisticsVisitor<'_> { + type Error = DataFusionError; + + fn pre_visit(&mut self, _plan: &dyn ExecutionPlan) -> Result { + Ok(false) + } + + fn post_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + // If this is an EmptyExec / PlaceholderRowExec, we don't know about it + if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + { + self.statistics.push_back(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }); + } + // If this is leaf node (ParquetExec or RecordBatchExec), compute its statistics and push it to the stack + else if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + { + // get index of the 
given column in the schema + let statistics = match plan.schema().index_of(self.column_name) { + Ok(col_index) => plan.statistics()?.column_statistics[col_index].clone(), + // This is the case of alias, do not optimize by returning no statistics + Err(_) => { + trace!( + " ------------------- No statistics for column {} in PQ/RB", + self.column_name + ); + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + } + } + }; + self.statistics.push_back(statistics); + } + // Non leaf node + else { + // These are cases the stats will be unioned of their children's + // Sort, Dediplicate, Filter, Repartition, Union, SortPreservingMerge, CoalesceBatches + let union_stats = if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan + .as_any() + .downcast_ref::() + .is_some() + || plan + .as_any() + .downcast_ref::() + .is_some() + { + true + } else if plan.as_any().downcast_ref::().is_some() { + // ProjectionExec is a special case. Only union stats if it includes pure columns + projection_includes_pure_columns( + plan.as_any().downcast_ref::().unwrap(), + ) + } else { + false + }; + + // pop statistics of all inputs from the stack + let num_inputs = plan.children().len(); + // num_input must > 0. 
Pop the first one + let mut statistics = self + .statistics + .pop_back() + .expect("No statistics for input plan"); + // pop the rest and update the min and max + for _ in 1..num_inputs { + let input_statistics = self + .statistics + .pop_back() + .expect("No statistics for input plan"); + + if union_stats { + // Convervatively union min max + statistics.null_count = statistics.null_count.add(&input_statistics.null_count); + statistics.max_value = statistics.max_value.max(&input_statistics.max_value); + statistics.min_value = statistics.min_value.min(&input_statistics.min_value); + statistics.distinct_count = Precision::Absent; + }; + } + + if union_stats { + self.statistics.push_back(statistics); + } else { + trace!( + " ------ No statistics for column {} in non-leaf node", + self.column_name + ); + // Make them absent for other cases + self.statistics.push_back(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }); + } + } + + Ok(true) + } +} + +fn projection_includes_pure_columns(projection: &ProjectionExec) -> bool { + projection + .expr() + .iter() + .all(|(expr, _col_name)| expr.as_any().downcast_ref::().is_some()) +} + +/// Return min max of a ColumnStatistics with precise values +pub fn column_statistics_min_max( + column_statistics: &ColumnStatistics, +) -> Option<(ScalarValue, ScalarValue)> { + match (&column_statistics.min_value, &column_statistics.max_value) { + (Precision::Exact(min), Precision::Exact(max)) => Some((min.clone(), max.clone())), + // the statistics values are absent or imprecise + _ => None, + } +} + +/// Get statsistics min max of given column name on given plans +/// Return None if one of the inputs does not have statistics or does not include the column +pub fn statistics_min_max( + plans: &[Arc], + column_name: &str, +) -> Option> { + // Get statistics for each plan + let plans_schema_and_stats = plans + .iter() + .map(|plan| 
Ok((Arc::clone(plan), plan.schema(), plan.statistics()?))) + .collect::, DataFusionError>>(); + + // If any without statistics, return none + let Ok(plans_schema_and_stats) = plans_schema_and_stats else { + return None; + }; + + // get value range of the sorted column for each input + let mut min_max_ranges = Vec::with_capacity(plans_schema_and_stats.len()); + for (input, input_schema, input_stats) in plans_schema_and_stats { + // get index of the sorted column in the schema + let Ok(sorted_col_index) = input_schema.index_of(column_name) else { + // panic that the sorted column is not in the schema + panic!("sorted column {} is not in the schema", column_name); + }; + + let column_stats = input_stats.column_statistics; + let sorted_col_stats = column_stats[sorted_col_index].clone(); + match (sorted_col_stats.min_value, sorted_col_stats.max_value) { + (Precision::Exact(min), Precision::Exact(max)) => { + min_max_ranges.push((min, max)); + } + // WARNING: this may produce incorrect results until we use more precision + // as `Inexact` is not guaranteed to cover the actual min and max values + // https://github.com/apache/arrow-datafusion/issues/8078 + (Precision::Inexact(min), Precision::Inexact(max)) => { + if let Some(_deduplicate_exec) = input.as_any().downcast_ref::() { + min_max_ranges.push((min, max)); + } else { + return None; + }; + } + // the statistics values are absent + _ => return None, + } + } + + Some(min_max_ranges) +} + +/// Return true if at least 2 min_max ranges in the given array overlap +pub fn overlap(value_ranges: &[(ScalarValue, ScalarValue)]) -> Result { + // interleave min and max into one iterator + let value_ranges_iter = value_ranges.iter().flat_map(|(min, max)| { + // panics if min > max + if min > max { + panic!("min ({:?}) > max ({:?})", min, max); + } + vec![min.clone(), max.clone()] + }); + + let value_ranges = ScalarValue::iter_to_array(value_ranges_iter)?; + + // rank it + let ranks = rank(&*value_ranges, None)?; + + // check 
overlap by checking if the max is rank right behind its corresponding min + // . non-overlap example: values of min-max pairs [3, 5, 9, 12, 1, 1, 6, 8] + // ranks: [3, 4, 7, 8, 2, 2, 5, 6] : max (even index) = its correspnding min (odd index) for same min max OR min + 1 + // . overlap example: [3, 5, 9, 12, 1, 1, 4, 6] : pair [3, 5] interleaves with pair [4, 6] + // ranks: [3, 5, 7, 8, 2, 2, 4, 6] + for i in (0..ranks.len()).step_by(2) { + if !((ranks[i] == ranks[i + 1]) || (ranks[i + 1] == ranks[i] + 1)) { + return Ok(true); + } + } + + Ok(false) +} + +#[cfg(test)] +mod test { + use crate::{ + provider::chunks_to_physical_nodes, + test::{format_execution_plan, TestChunk}, + }; + + use super::*; + use arrow::datatypes::{DataType, Field}; + use datafusion::{common::Statistics, error::DataFusionError}; + use itertools::Itertools; + use schema::{InfluxFieldType, SchemaBuilder}; + + #[test] + fn test_df_stats_agg_no_cols_no_updates() { + let schema = Schema::new(Vec::::new()); + let agg = DFStatsAggregator::new(&schema); + + let actual = agg.build(); + let expected = DFStatistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: Statistics::unknown_column(&schema), + }; + assert_eq!(actual, expected); + } + + #[test] + fn test_df_stats_agg_no_updates() { + let schema = Schema::new(vec![ + Field::new("col1", DataType::UInt64, true), + Field::new("col2", DataType::Utf8, false), + ]); + let agg = DFStatsAggregator::new(&schema); + + let actual = agg.build(); + let expected = DFStatistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }, + ], + }; + 
assert_eq!(actual, expected); + } + + #[test] + fn test_df_stats_agg_valid_update_partial() { + let schema = Schema::new(vec![ + Field::new("col1", DataType::UInt64, true), + Field::new("col2", DataType::Utf8, false), + ]); + let mut agg = DFStatsAggregator::new(&schema); + + let update_schema = Schema::new(vec![ + Field::new("col1", DataType::UInt64, true), + Field::new("col2", DataType::Utf8, false), + ]); + let update_stats = DFStatistics { + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), + }, + ColumnStatistics { + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), + }, + ], + }; + agg.update(&update_stats, &update_schema); + + let update_schema = Schema::new(vec![Field::new("col2", DataType::Utf8, false)]); + let update_stats = DFStatistics { + num_rows: Precision::Exact(10_000), + total_byte_size: Precision::Exact(100_000), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(1_000_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("c".to_owned()))), + distinct_count: Precision::Exact(42), + }], + }; + agg.update(&update_stats, &update_schema); + + let actual = agg.build(); + let expected = DFStatistics { + num_rows: Precision::Exact(10_001), + total_byte_size: Precision::Exact(100_010), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: 
Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Exact(1_001_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Absent, + }, + ], + }; + assert_eq!(actual, expected); + } + + #[test] + fn test_df_stats_agg_valid_update_col_reorder() { + let schema = Schema::new(vec![ + Field::new("col1", DataType::UInt64, true), + Field::new("col2", DataType::Utf8, false), + ]); + let mut agg = DFStatsAggregator::new(&schema); + + let update_schema = Schema::new(vec![ + Field::new("col1", DataType::UInt64, true), + Field::new("col2", DataType::Utf8, false), + ]); + let update_stats = DFStatistics { + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), + }, + ColumnStatistics { + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), + }, + ], + }; + agg.update(&update_stats, &update_schema); + + let update_schema = Schema::new(vec![ + Field::new("col2", DataType::Utf8, false), + Field::new("col1", DataType::UInt64, true), + ]); + let update_stats = DFStatistics { + num_rows: Precision::Exact(10_000), + total_byte_size: Precision::Exact(100_000), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(1_000_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("c".to_owned()))), + distinct_count: Precision::Exact(42), + }, + ColumnStatistics { + null_count: Precision::Exact(10_000_000), + max_value: 
Precision::Exact(ScalarValue::UInt64(Some(99))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(40))), + distinct_count: Precision::Exact(42), + }, + ], + }; + agg.update(&update_stats, &update_schema); + + let actual = agg.build(); + let expected = DFStatistics { + num_rows: Precision::Exact(10_001), + total_byte_size: Precision::Exact(100_010), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(10_000_100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(40))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Exact(1_001_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Absent, + }, + ], + }; + assert_eq!(actual, expected); + } + + #[test] + fn test_df_stats_agg_ignores_unknown_cols() { + let schema = Schema::new(vec![ + Field::new("col1", DataType::UInt64, true), + Field::new("col2", DataType::Utf8, false), + ]); + let mut agg = DFStatsAggregator::new(&schema); + + let update_schema = Schema::new(vec![ + Field::new("col1", DataType::UInt64, true), + Field::new("col3", DataType::Utf8, false), + ]); + let update_stats = DFStatistics { + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), + }, + ColumnStatistics { + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), + }, + ], + }; + agg.update(&update_stats, &update_schema); + + let actual = agg.build(); + let 
expected = DFStatistics { + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }, + ], + }; + assert_eq!(actual, expected); + } + + #[test] + fn test_df_stats_agg_invalidation() { + let schema = Schema::new(vec![ + Field::new("col1", DataType::UInt64, true), + Field::new("col2", DataType::Utf8, false), + ]); + + let update_stats = DFStatistics { + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), + }, + ColumnStatistics { + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), + }, + ], + }; + let agg_stats = DFStatistics { + num_rows: Precision::Exact(2), + total_byte_size: Precision::Exact(20), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(200), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Exact(2_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Absent, + }, + ], + }; + + 
#[derive(Debug, Clone, Copy)] + enum ColMode { + NullCount, + MaxValue, + MinValue, + } + + #[derive(Debug, Clone, Copy)] + enum Mode { + NumRows, + TotalByteSize, + ColumnStatistics, + Col(usize, ColMode), + } + + impl Mode { + fn mask(&self, mut stats: DFStatistics) -> DFStatistics { + match self { + Self::NumRows => { + stats.num_rows = Precision::Absent; + } + Self::TotalByteSize => { + stats.total_byte_size = Precision::Absent; + } + Self::ColumnStatistics => { + let num_cols = stats.column_statistics.len(); + stats.column_statistics = vec![ColumnStatistics::new_unknown(); num_cols] + } + Self::Col(idx, mode) => { + let stats = &mut stats.column_statistics[*idx]; + + match mode { + ColMode::NullCount => { + stats.null_count = Precision::Absent; + } + ColMode::MaxValue => { + stats.max_value = Precision::Absent; + } + ColMode::MinValue => { + stats.min_value = Precision::Absent; + } + } + } + } + stats + } + } + + for mode in [ + Mode::NumRows, + Mode::TotalByteSize, + Mode::ColumnStatistics, + Mode::Col(0, ColMode::NullCount), + Mode::Col(0, ColMode::MaxValue), + Mode::Col(0, ColMode::MinValue), + Mode::Col(1, ColMode::NullCount), + ] { + println!("mode: {mode:?}"); + + for invalid_mask in [[false, true], [true, false], [true, true]] { + println!("invalid_mask: {invalid_mask:?}"); + let mut agg = DFStatsAggregator::new(&schema); + + for invalid in invalid_mask { + let mut update_stats = update_stats.clone(); + if invalid { + update_stats = mode.mask(update_stats); + } + agg.update(&update_stats, &schema); + } + + let actual = agg.build(); + + let expected = mode.mask(agg_stats.clone()); + assert_eq!(actual, expected); + } + } + } + + #[test] + #[should_panic(expected = "stats (0) and schema (1) have different column count")] + fn test_df_stats_agg_asserts_schema_stats_match() { + let schema = Schema::new(vec![Field::new("col1", DataType::UInt64, true)]); + let mut agg = DFStatsAggregator::new(&schema); + + let update_schema = 
Schema::new(vec![Field::new("col1", DataType::UInt64, true)]); + let update_stats = DFStatistics { + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![], + }; + agg.update(&update_stats, &update_schema); + } + + #[test] + fn test_stats_for_one_chunk() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("field", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into(); + + // create a test chunk with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)), + ); + + // create them same test chunk but with a parquet file + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)) + .with_dummy_parquet_file(), + ); + + let expected_stats = [ + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Utf8(Some("MT".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("AL".to_string()))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: 
Precision::Exact(ScalarValue::TimestampNanosecond(Some(20), None)), + min_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(10), None)), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(6))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ]; + + let record_batch_stats = + build_statistics_for_chunks(&[record_batch_chunk], Arc::clone(&schema)); + assert_eq!(record_batch_stats.column_statistics, expected_stats); + + let parquet_stats = build_statistics_for_chunks(&[parquet_chunk], schema); + assert_eq!(parquet_stats.column_statistics, expected_stats); + } + + #[test] + fn test_stats_for_two_chunks() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("field", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into(); + + // create a test chunk with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let record_batch_chunk_1 = Arc::new( + TestChunk::new("t1") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)), + ); + + let record_batch_chunk_2 = Arc::new( + TestChunk::new("t2") + .with_tag_column_with_stats("tag", Some("MI"), Some("WA")) + .with_time_column_with_stats(Some(50), Some(80)) + .with_i64_field_column_with_stats("field", Some(0), Some(70)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(7), Some(15)), + ); + + // create them same test chunk but with a parquet file + let parquet_chunk_1 = Arc::new( + TestChunk::new("t1") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + 
.with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)) + .with_dummy_parquet_file(), + ); + + let parquet_chunk_2 = Arc::new( + TestChunk::new("t2") + .with_tag_column_with_stats("tag", Some("MI"), Some("WA")) + .with_i64_field_column_with_stats("field", Some(0), Some(70)) + .with_time_column_with_stats(Some(50), Some(80)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(7), Some(15)) + .with_dummy_parquet_file(), + ); + + let expected_stats = [ + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Utf8(Some("WA".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("AL".to_string()))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(80), None)), + min_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(10), None)), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(15))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ]; + + let record_batch_stats = build_statistics_for_chunks( + &[record_batch_chunk_1, record_batch_chunk_2], + Arc::clone(&schema), + ); + assert_eq!(record_batch_stats.column_statistics, expected_stats); + + let parquet_stats = + build_statistics_for_chunks(&[parquet_chunk_1, parquet_chunk_2], schema); + assert_eq!(parquet_stats.column_statistics, expected_stats); + } + + #[test] + fn test_compute_statistics_min_max() { + // schema with one tag, one 
field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("float_field", InfluxFieldType::Float) + .influx_field("int_field", InfluxFieldType::Integer) + .influx_field("string_field", InfluxFieldType::String) + .tag("tag_no_val") // no chunks have values for this + .influx_field("field_no_val", InfluxFieldType::Integer) + .timestamp() + .build() + .unwrap() + .into(); + + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(10), Some(100)) + .with_tag_column_with_stats("tag", Some("MA"), Some("VT")) + .with_f64_field_column_with_stats("float_field", Some(10.1), Some(100.4)) + .with_i64_field_column_with_stats("int_field", Some(30), Some(50)) + .with_string_field_column_with_stats("string_field", Some("orange"), Some("plum")) + // only this chunk has value for this field + .with_i64_field_column_with_stats("field_no_val", Some(30), Some(50)) + .with_dummy_parquet_file(), + ) as Arc; + + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(20), Some(200)) + .with_tag_column_with_stats("tag", Some("Boston"), Some("DC")) + .with_f64_field_column_with_stats("float_field", Some(15.6), Some(30.0)) + .with_i64_field_column_with_stats("int_field", Some(1), Some(50)) + .with_string_field_column_with_stats("string_field", Some("banana"), Some("plum")), + ) as Arc; + + let plan_pq = chunks_to_physical_nodes(&schema, None, vec![parquet_chunk], 1); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan_pq), + @r###" + --- + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[tag, float_field, int_field, string_field, tag_no_val, field_no_val, time]" + "### + ); + + let plan_rb = chunks_to_physical_nodes(&schema, None, vec![record_batch_chunk], 1); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan_rb), + @r###" + --- + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[tag, float_field, 
int_field, string_field, tag_no_val, field_no_val, time]" + "### + ); + + // Stats for time + // parquet + let time_stats = compute_stats_column_min_max(&*plan_pq, "time").unwrap(); + let min_max = column_statistics_min_max(&time_stats).unwrap(); + let expected_time_stats = ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(100), None), + ); + assert_eq!(min_max, expected_time_stats); + // record batch + let time_stats = compute_stats_column_min_max(&*plan_rb, "time").unwrap(); + let min_max = column_statistics_min_max(&time_stats).unwrap(); + let expected_time_stats = ( + ScalarValue::TimestampNanosecond(Some(20), None), + ScalarValue::TimestampNanosecond(Some(200), None), + ); + assert_eq!(min_max, expected_time_stats); + + // Stats for tag + // parquet + let tag_stats = compute_stats_column_min_max(&*plan_pq, "tag").unwrap(); + let min_max = column_statistics_min_max(&tag_stats).unwrap(); + let expected_tag_stats = ( + ScalarValue::Utf8(Some("MA".to_string())), + ScalarValue::Utf8(Some("VT".to_string())), + ); + assert_eq!(min_max, expected_tag_stats); + // record batch + let tag_stats = compute_stats_column_min_max(&*plan_rb, "tag").unwrap(); + let min_max = column_statistics_min_max(&tag_stats).unwrap(); + let expected_tag_stats = ( + ScalarValue::Utf8(Some("Boston".to_string())), + ScalarValue::Utf8(Some("DC".to_string())), + ); + assert_eq!(min_max, expected_tag_stats); + + // Stats for field + // parquet + let float_stats = compute_stats_column_min_max(&*plan_pq, "float_field").unwrap(); + let min_max = column_statistics_min_max(&float_stats).unwrap(); + let expected_float_stats = ( + ScalarValue::Float64(Some(10.1)), + ScalarValue::Float64(Some(100.4)), + ); + assert_eq!(min_max, expected_float_stats); + // record batch + let float_stats = compute_stats_column_min_max(&*plan_rb, "float_field").unwrap(); + let min_max = column_statistics_min_max(&float_stats).unwrap(); + let expected_float_stats = ( + 
ScalarValue::Float64(Some(15.6)), + ScalarValue::Float64(Some(30.0)), + ); + assert_eq!(min_max, expected_float_stats); + + // Stats for int + // parquet + let int_stats = compute_stats_column_min_max(&*plan_pq, "int_field").unwrap(); + let min_max = column_statistics_min_max(&int_stats).unwrap(); + let expected_int_stats = (ScalarValue::Int64(Some(30)), ScalarValue::Int64(Some(50))); + assert_eq!(min_max, expected_int_stats); + // record batch + let int_stats = compute_stats_column_min_max(&*plan_rb, "int_field").unwrap(); + let min_max = column_statistics_min_max(&int_stats).unwrap(); + let expected_int_stats = (ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(50))); + assert_eq!(min_max, expected_int_stats); + + // Stats for string + // parquet + let string_stats = compute_stats_column_min_max(&*plan_pq, "string_field").unwrap(); + let min_max = column_statistics_min_max(&string_stats).unwrap(); + let expected_string_stats = ( + ScalarValue::Utf8(Some("orange".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ); + assert_eq!(min_max, expected_string_stats); + // record batch + let string_stats = compute_stats_column_min_max(&*plan_rb, "string_field").unwrap(); + let min_max = column_statistics_min_max(&string_stats).unwrap(); + let expected_string_stats = ( + ScalarValue::Utf8(Some("banana".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ); + assert_eq!(min_max, expected_string_stats); + + // no tats on parquet + let tag_no_stats = compute_stats_column_min_max(&*plan_pq, "tag_no_val").unwrap(); + let min_max = column_statistics_min_max(&tag_no_stats); + assert!(min_max.is_none()); + + // no stats on record batch + let field_no_stats = compute_stats_column_min_max(&*plan_rb, "field_no_val").unwrap(); + let min_max = column_statistics_min_max(&field_no_stats); + assert!(min_max.is_none()); + } + + #[test] + fn test_statistics_min_max() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef 
= SchemaBuilder::new() + .tag("tag") + .influx_field("float_field", InfluxFieldType::Float) + .influx_field("int_field", InfluxFieldType::Integer) + .influx_field("string_field", InfluxFieldType::String) + .tag("tag_no_val") // no chunks have values for this + .influx_field("field_no_val", InfluxFieldType::Integer) + .timestamp() + .build() + .unwrap() + .into(); + + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(10), Some(100)) + .with_tag_column_with_stats("tag", Some("MA"), Some("VT")) + .with_f64_field_column_with_stats("float_field", Some(10.1), Some(100.4)) + .with_i64_field_column_with_stats("int_field", Some(30), Some(50)) + .with_string_field_column_with_stats("string_field", Some("orange"), Some("plum")) + // only this chunk has value for this field + .with_i64_field_column_with_stats("field_no_val", Some(30), Some(50)) + .with_dummy_parquet_file(), + ) as Arc; + + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(20), Some(200)) + .with_tag_column_with_stats("tag", Some("Boston"), Some("DC")) + .with_f64_field_column_with_stats("float_field", Some(15.6), Some(30.0)) + .with_i64_field_column_with_stats("int_field", Some(1), Some(50)) + .with_string_field_column_with_stats("string_field", Some("banana"), Some("plum")), + ) as Arc; + + let plan1 = chunks_to_physical_nodes(&schema, None, vec![parquet_chunk], 1); + let plan2 = chunks_to_physical_nodes(&schema, None, vec![record_batch_chunk], 1); + + let time_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "time").unwrap(); + let expected_time_stats = [ + ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(100), None), + ), + ( + ScalarValue::TimestampNanosecond(Some(20), None), + ScalarValue::TimestampNanosecond(Some(200), None), + ), + ]; + assert_eq!(time_stats, expected_time_stats); + + let tag_stats = + statistics_min_max(&[Arc::clone(&plan1), 
Arc::clone(&plan2)], "tag").unwrap(); + let expected_tag_stats = [ + ( + ScalarValue::Utf8(Some("MA".to_string())), + ScalarValue::Utf8(Some("VT".to_string())), + ), + ( + ScalarValue::Utf8(Some("Boston".to_string())), + ScalarValue::Utf8(Some("DC".to_string())), + ), + ]; + assert_eq!(tag_stats, expected_tag_stats); + + let float_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "float_field").unwrap(); + let expected_float_stats = [ + ( + ScalarValue::Float64(Some(10.1)), + ScalarValue::Float64(Some(100.4)), + ), + ( + ScalarValue::Float64(Some(15.6)), + ScalarValue::Float64(Some(30.0)), + ), + ]; + assert_eq!(float_stats, expected_float_stats); + + let int_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "int_field").unwrap(); + let expected_int_stats = [ + (ScalarValue::Int64(Some(30)), ScalarValue::Int64(Some(50))), + (ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(50))), + ]; + assert_eq!(int_stats, expected_int_stats); + + let string_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "string_field").unwrap(); + let expected_string_stats = [ + ( + ScalarValue::Utf8(Some("orange".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ), + ( + ScalarValue::Utf8(Some("banana".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ), + ]; + assert_eq!(string_stats, expected_string_stats); + + let tag_no_stat = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "tag_no_val"); + assert!(tag_no_stat.is_none()); + + let field_no_stat = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "field_no_val"); + assert!(field_no_stat.is_none()); + } + + #[test] + fn test_non_overlap_time() { + let pair_1 = ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(20), None), + ); + let pair_2 = ( + ScalarValue::TimestampNanosecond(Some(100), None), + ScalarValue::TimestampNanosecond(Some(150), None), + ); + let pair_3 = ( + 
ScalarValue::TimestampNanosecond(Some(60), None), + ScalarValue::TimestampNanosecond(Some(65), None), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_time() { + let pair_1 = ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(20), None), + ); + let pair_2 = ( + ScalarValue::TimestampNanosecond(Some(100), None), + ScalarValue::TimestampNanosecond(Some(150), None), + ); + let pair_3 = ( + ScalarValue::TimestampNanosecond(Some(8), None), + ScalarValue::TimestampNanosecond(Some(10), None), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(overlap); + } + + #[test] + fn test_non_overlap_integer() { + // [3, 5, 9, 12, 1, 1, 6, 8] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(Some(9)), ScalarValue::Int16(Some(12))); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(1))); + let pair_4 = (ScalarValue::Int16(Some(6)), ScalarValue::Int16(Some(8))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_integer() { + // [3, 5, 9, 12, 1, 1, 4, 6] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(Some(9)), ScalarValue::Int16(Some(12))); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(1))); + let pair_4 = (ScalarValue::Int16(Some(4)), ScalarValue::Int16(Some(6))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(overlap); + } + + #[test] + fn test_non_overlap_integer_ascending_null_first() { + // [3, 5, null, null, 1, 1, 6, 8] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(None), ScalarValue::Int16(None)); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(2))); + 
let pair_4 = (ScalarValue::Int16(Some(6)), ScalarValue::Int16(Some(8))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_integer_ascending_null_first() { + // [3, 5, null, null, 1, 1, 4, 6] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(None), ScalarValue::Int16(None)); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(2))); + let pair_4 = (ScalarValue::Int16(Some(4)), ScalarValue::Int16(Some(6))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(overlap); + } + + #[test] + fn test_non_overlap_string_ascending_null_first() { + // ['e', 'h', null, null, 'a', 'a', 'k', 'q'] + let pair_1 = ( + ScalarValue::Utf8(Some('e'.to_string())), + ScalarValue::Utf8(Some('h'.to_string())), + ); + let pair_2 = (ScalarValue::Utf8(None), ScalarValue::Utf8(None)); + let pair_3 = ( + ScalarValue::Utf8(Some('a'.to_string())), + ScalarValue::Utf8(Some('a'.to_string())), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_string_ascending_null_first() { + // ['e', 'h', null, null, 'a', 'f', 'k', 'q'] + let pair_1 = ( + ScalarValue::Utf8(Some('e'.to_string())), + ScalarValue::Utf8(Some('h'.to_string())), + ); + let pair_2 = (ScalarValue::Utf8(None), ScalarValue::Utf8(None)); + let pair_3 = ( + ScalarValue::Utf8(Some('a'.to_string())), + ScalarValue::Utf8(Some('f'.to_string())), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(overlap); + } + + #[test] + #[should_panic(expected = "Internal(\"Empty iterator passed to ScalarValue::iter_to_array\")")] + fn test_overlap_empty() { + let _overlap = overlap_all(&[]); + } + + #[should_panic(expected = "min (Int16(3)) > max (Int16(2))")] + #[test] + fn test_overlap_panic() { + // max < min + let pair_1 = 
(ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(2))); + let _overlap = overlap_all(&[pair_1]); + } + + /// Runs `overlap` on all permutations of the given `value_range`es and asserts that the result is + /// the same. Returns that result + fn overlap_all(value_ranges: &[(ScalarValue, ScalarValue)]) -> Result { + let n = value_ranges.len(); + + let mut overlaps_all_permutations = value_ranges + .iter() + .cloned() + .permutations(n) + .map(|v| overlap(&v)); + + let Some(first) = overlaps_all_permutations.next() else { + return overlap(value_ranges); + }; + + let first = first.unwrap(); + + for result in overlaps_all_permutations { + assert_eq!(&result.unwrap(), &first); + } + + Ok(first) + } +} diff --git a/iox_query/src/test.rs b/iox_query/src/test.rs new file mode 100644 index 0000000..e969776 --- /dev/null +++ b/iox_query/src/test.rs @@ -0,0 +1,1220 @@ +//! This module provides a reference implementation of [`QueryNamespace`] for use in testing. +//! +//! AKA it is a Mock + +use crate::{ + exec::{ + stringset::{StringSet, StringSetRef}, + Executor, ExecutorType, IOxSessionContext, + }, + pruning::prune_chunks, + query_log::{QueryLog, StateReceived}, + QueryChunk, QueryChunkData, QueryCompletedToken, QueryNamespace, QueryNamespaceProvider, + QueryText, +}; +use arrow::array::{BooleanArray, Float64Array}; +use arrow::datatypes::SchemaRef; +use arrow::{ + array::{ + ArrayRef, DictionaryArray, Int64Array, StringArray, TimestampNanosecondArray, UInt64Array, + }, + datatypes::{DataType, Int32Type, TimeUnit}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use data_types::{ChunkId, ChunkOrder, NamespaceId, PartitionKey, TableId, TransitionPartitionId}; +use datafusion::common::stats::Precision; +use datafusion::error::DataFusionError; +use datafusion::execution::context::SessionState; +use datafusion::logical_expr::Expr; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::{catalog::schema::SchemaProvider, logical_expr::LogicalPlan}; 
+use datafusion::{catalog::CatalogProvider, physical_plan::displayable}; +use datafusion::{ + datasource::{object_store::ObjectStoreUrl, TableProvider, TableType}, + physical_plan::{ColumnStatistics, Statistics as DataFusionStatistics}, + scalar::ScalarValue, +}; +use datafusion_util::{config::DEFAULT_SCHEMA, option_to_precision, timestamptz_nano}; +use iox_time::SystemProvider; +use itertools::Itertools; +use object_store::{path::Path, ObjectMeta}; +use parking_lot::Mutex; +use parquet_file::storage::ParquetExecInput; +use schema::{ + builder::SchemaBuilder, merge::SchemaMerger, sort::SortKey, Schema, TIME_COLUMN_NAME, +}; +use std::{ + any::Any, + collections::{BTreeMap, HashMap}, + fmt, + num::NonZeroU64, + sync::Arc, +}; +use trace::{ctx::SpanContext, span::Span}; +use tracker::{AsyncSemaphoreMetrics, InstrumentedAsyncOwnedSemaphorePermit}; + +#[derive(Debug)] +pub struct TestDatabaseStore { + databases: Mutex>>, + executor: Arc, + pub metric_registry: Arc, + pub query_semaphore: Arc, +} + +impl TestDatabaseStore { + pub fn new() -> Self { + Self::default() + } + + pub fn new_with_semaphore_size(semaphore_size: usize) -> Self { + let metric_registry = Arc::new(metric::Registry::default()); + let semaphore_metrics = Arc::new(AsyncSemaphoreMetrics::new( + &metric_registry, + &[("semaphore", "query_execution")], + )); + Self { + databases: Mutex::new(BTreeMap::new()), + executor: Arc::new(Executor::new_testing()), + metric_registry, + query_semaphore: Arc::new(semaphore_metrics.new_semaphore(semaphore_size)), + } + } + + pub async fn db_or_create(&self, name: &str) -> Arc { + let mut databases = self.databases.lock(); + + if let Some(db) = databases.get(name) { + Arc::clone(db) + } else { + let new_db = Arc::new(TestDatabase::new(Arc::clone(&self.executor))); + databases.insert(name.to_string(), Arc::clone(&new_db)); + new_db + } + } +} + +impl Default for TestDatabaseStore { + fn default() -> Self { + Self::new_with_semaphore_size(u16::MAX as usize) + } +} + 
+#[async_trait] +impl QueryNamespaceProvider for TestDatabaseStore { + /// Retrieve the database specified name + async fn db( + &self, + name: &str, + _span: Option, + _include_debug_info_tables: bool, + ) -> Option> { + let databases = self.databases.lock(); + + databases.get(name).cloned().map(|ns| ns as _) + } + + async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit { + Arc::clone(&self.query_semaphore) + .acquire_owned(span) + .await + .unwrap() + } +} + +#[derive(Debug)] +pub struct TestDatabase { + executor: Arc, + /// Partitions which have been saved to this test database + /// Key is partition name + /// Value is map of chunk_id to chunk + partitions: Mutex>>>, + + /// `column_names` to return upon next request + column_names: Arc>>, + + /// The predicate passed to the most recent call to `chunks()` + chunks_predicate: Mutex>, + + /// Retention time ns. + retention_time_ns: Option, +} + +impl TestDatabase { + pub fn new(executor: Arc) -> Self { + Self { + executor, + partitions: Default::default(), + column_names: Default::default(), + chunks_predicate: Default::default(), + retention_time_ns: None, + } + } + + /// Add a test chunk to the database + pub fn add_chunk(&self, partition_key: &str, chunk: Arc) -> &Self { + let mut partitions = self.partitions.lock(); + let chunks = partitions.entry(partition_key.to_string()).or_default(); + chunks.insert(chunk.id(), chunk); + self + } + + /// Add a test chunk to the database + pub fn with_chunk(self, partition_key: &str, chunk: Arc) -> Self { + self.add_chunk(partition_key, chunk); + self + } + + /// Get the specified chunk + pub fn get_chunk(&self, partition_key: &str, id: ChunkId) -> Option> { + self.partitions + .lock() + .get(partition_key) + .and_then(|p| p.get(&id).cloned()) + } + + /// Return the most recent predicate passed to get_chunks() + pub fn get_chunks_predicate(&self) -> Vec { + self.chunks_predicate.lock().clone() + } + + /// Set the list of column names that 
will be returned on a call to + /// column_names + pub fn set_column_names(&self, column_names: Vec) { + let column_names = column_names.into_iter().collect::(); + let column_names = Arc::new(column_names); + + *Arc::clone(&self.column_names).lock() = Some(column_names) + } + + /// Set retention time. + pub fn with_retention_time_ns(mut self, retention_time_ns: Option) -> Self { + self.retention_time_ns = retention_time_ns; + self + } +} + +#[async_trait] +impl QueryNamespace for TestDatabase { + async fn chunks( + &self, + table_name: &str, + filters: &[Expr], + _projection: Option<&Vec>, + _ctx: IOxSessionContext, + ) -> Result>, DataFusionError> { + // save last predicate + *self.chunks_predicate.lock() = filters.to_vec(); + + let partitions = self.partitions.lock().clone(); + Ok(partitions + .values() + .flat_map(|x| x.values()) + // filter by table + .filter(|c| c.table_name == table_name) + // only keep chunks if their statistics overlap + .filter(|c| { + prune_chunks( + c.schema(), + &[Arc::clone(*c) as Arc], + filters, + ) + .ok() + .map(|res| res[0]) + .unwrap_or(true) + }) + .map(|x| Arc::clone(x) as Arc) + .collect::>()) + } + + fn retention_time_ns(&self) -> Option { + self.retention_time_ns + } + + fn record_query( + &self, + span_ctx: Option<&SpanContext>, + query_type: &'static str, + query_text: QueryText, + ) -> QueryCompletedToken { + QueryLog::new(0, Arc::new(SystemProvider::new())).push( + NamespaceId::new(1), + Arc::from("ns"), + query_type, + query_text, + span_ctx.map(|s| s.trace_id), + ) + } + + fn new_query_context(&self, span_ctx: Option) -> IOxSessionContext { + // Note: unlike Db this does not register a catalog provider + self.executor + .new_execution_config(ExecutorType::Query) + .with_default_catalog(Arc::new(TestDatabaseCatalogProvider::from_test_database( + self, + ))) + .with_span_context(span_ctx) + .build() + } +} + +struct TestDatabaseCatalogProvider { + partitions: BTreeMap>>, +} + +impl TestDatabaseCatalogProvider { + fn 
from_test_database(db: &TestDatabase) -> Self { + Self { + partitions: db.partitions.lock().clone(), + } + } +} + +impl CatalogProvider for TestDatabaseCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + vec![DEFAULT_SCHEMA.to_string()] + } + + fn schema(&self, name: &str) -> Option> { + match name { + DEFAULT_SCHEMA => Some(Arc::new(TestDatabaseSchemaProvider { + partitions: self.partitions.clone(), + })), + _ => None, + } + } +} + +struct TestDatabaseSchemaProvider { + partitions: BTreeMap>>, +} + +#[async_trait] +impl SchemaProvider for TestDatabaseSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.partitions + .values() + .flat_map(|c| c.values()) + .map(|c| c.table_name.to_owned()) + .unique() + .collect() + } + + async fn table(&self, name: &str) -> Option> { + Some(Arc::new(TestDatabaseTableProvider { + partitions: self + .partitions + .values() + .flat_map(|chunks| chunks.values().filter(|c| c.table_name() == name)) + .map(Clone::clone) + .collect(), + })) + } + + fn table_exist(&self, name: &str) -> bool { + self.table_names().contains(&name.to_string()) + } +} + +struct TestDatabaseTableProvider { + partitions: Vec>, +} + +#[async_trait] +impl TableProvider for TestDatabaseTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.partitions + .iter() + .fold(SchemaMerger::new(), |merger, chunk| { + merger.merge(chunk.schema()).expect("consistent schemas") + }) + .build() + .as_arrow() + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + async fn scan( + &self, + _ctx: &SessionState, + _projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> crate::exec::context::Result> { + unimplemented!() + } +} + +#[derive(Debug, Clone)] +enum TestChunkData { + RecordBatches(Vec), + Parquet(ParquetExecInput), +} + +#[derive(Debug, Clone)] +pub struct TestChunk { + /// Table name + table_name: 
String, + + /// Schema of the table + schema: Schema, + + /// Values for stats() + column_stats: HashMap, + num_rows: Option, + + id: ChunkId, + + partition_id: TransitionPartitionId, + + /// Set the flag if this chunk might contain duplicates + may_contain_pk_duplicates: bool, + + /// Data in this chunk. + table_data: TestChunkData, + + /// A saved error that is returned instead of actual results + saved_error: Option, + + /// Order of this chunk relative to other overlapping chunks. + order: ChunkOrder, + + /// The sort key of this chunk + sort_key: Option, + + /// Suppress output + quiet: bool, +} + +/// Implements a method for adding a column with default stats +macro_rules! impl_with_column { + ($NAME:ident, $DATA_TYPE:ident) => { + pub fn $NAME(self, column_name: impl Into) -> Self { + let column_name = column_name.into(); + + let new_column_schema = SchemaBuilder::new() + .field(&column_name, DataType::$DATA_TYPE) + .unwrap() + .build() + .unwrap(); + self.add_schema_to_table(new_column_schema, None) + } + }; +} + +/// Implements a method for adding a column with stats that have the specified min and max +macro_rules! 
impl_with_column_with_stats { + ($NAME:ident, $DATA_TYPE:ident, $RUST_TYPE:ty, $STAT_TYPE:ident) => { + pub fn $NAME( + self, + column_name: impl Into, + min: Option<$RUST_TYPE>, + max: Option<$RUST_TYPE>, + ) -> Self { + let column_name = column_name.into(); + + let new_column_schema = SchemaBuilder::new() + .field(&column_name, DataType::$DATA_TYPE) + .unwrap() + .build() + .unwrap(); + + let stats = ColumnStatistics { + null_count: Precision::Absent, + max_value: option_to_precision(max.map(|s| ScalarValue::from(s))), + min_value: option_to_precision(min.map(|s| ScalarValue::from(s))), + distinct_count: Precision::Absent, + }; + + self.add_schema_to_table(new_column_schema, Some(stats)) + } + }; +} + +impl TestChunk { + pub fn new(table_name: impl Into) -> Self { + let table_name = table_name.into(); + Self { + table_name, + schema: SchemaBuilder::new().build().unwrap(), + column_stats: Default::default(), + num_rows: None, + id: ChunkId::new_test(0), + may_contain_pk_duplicates: Default::default(), + table_data: TestChunkData::RecordBatches(vec![]), + saved_error: Default::default(), + order: ChunkOrder::MIN, + sort_key: None, + partition_id: TransitionPartitionId::arbitrary_for_testing(), + quiet: false, + } + } + + fn push_record_batch(&mut self, batch: RecordBatch) { + match &mut self.table_data { + TestChunkData::RecordBatches(batches) => { + batches.push(batch); + } + TestChunkData::Parquet(_) => panic!("chunk is parquet-based"), + } + } + + pub fn with_order(self, order: i64) -> Self { + Self { + order: ChunkOrder::new(order), + ..self + } + } + + pub fn with_dummy_parquet_file(self) -> Self { + self.with_dummy_parquet_file_and_store("iox://store") + } + + pub fn with_dummy_parquet_file_and_size(self, size: usize) -> Self { + self.with_dummy_parquet_file_and_store_and_size("iox://store", size) + } + + pub fn with_dummy_parquet_file_and_store(self, store: &str) -> Self { + self.with_dummy_parquet_file_and_store_and_size(store, 1) + } + + pub fn 
with_dummy_parquet_file_and_store_and_size(self, store: &str, size: usize) -> Self { + match self.table_data { + TestChunkData::RecordBatches(batches) => { + assert!(batches.is_empty(), "chunk already has record batches"); + } + TestChunkData::Parquet(_) => panic!("chunk already has a file"), + } + + Self { + table_data: TestChunkData::Parquet(ParquetExecInput { + object_store_url: ObjectStoreUrl::parse(store).unwrap(), + object_meta: ObjectMeta { + location: Self::parquet_location(self.id), + last_modified: Default::default(), + size, + e_tag: None, + version: None, + }, + }), + ..self + } + } + + fn parquet_location(chunk_id: ChunkId) -> Path { + Path::parse(format!("{}.parquet", chunk_id.get().as_u128())).unwrap() + } + + /// Returns the receiver configured to suppress any output to STDOUT. + pub fn with_quiet(mut self) -> Self { + self.quiet = true; + self + } + + pub fn with_id(mut self, id: u128) -> Self { + self.id = ChunkId::new_test(id); + + if let TestChunkData::Parquet(parquet_input) = &mut self.table_data { + parquet_input.object_meta.location = Self::parquet_location(self.id); + } + + self + } + + pub fn with_partition(mut self, id: i64) -> Self { + self.partition_id = + TransitionPartitionId::new(TableId::new(id), &PartitionKey::from("arbitrary")); + self + } + + pub fn with_partition_id(mut self, id: TransitionPartitionId) -> Self { + self.partition_id = id; + self + } + + /// specify that any call should result in an error with the message + /// specified + pub fn with_error(mut self, error_message: impl Into) -> Self { + self.saved_error = Some(error_message.into()); + self + } + + /// Checks the saved error, and returns it if any, otherwise returns OK + fn check_error(&self) -> Result<(), DataFusionError> { + if let Some(message) = self.saved_error.as_ref() { + Err(DataFusionError::External(message.clone().into())) + } else { + Ok(()) + } + } + + /// Set the `may_contain_pk_duplicates` flag + pub fn with_may_contain_pk_duplicates(mut self, v: 
bool) -> Self { + self.may_contain_pk_duplicates = v; + self + } + + /// Register a tag column with the test chunk with default stats + pub fn with_tag_column(self, column_name: impl Into) -> Self { + let column_name = column_name.into(); + + // make a new schema with the specified column and + // merge it in to any existing schema + let new_column_schema = SchemaBuilder::new().tag(&column_name).build().unwrap(); + + self.add_schema_to_table(new_column_schema, None) + } + + /// Register a tag column with stats with the test chunk + pub fn with_tag_column_with_stats( + self, + column_name: impl Into, + min: Option<&str>, + max: Option<&str>, + ) -> Self { + self.with_tag_column_with_full_stats(column_name, min, max, 0, None) + } + + /// Register a tag column with stats with the test chunk + pub fn with_tag_column_with_full_stats( + self, + column_name: impl Into, + min: Option<&str>, + max: Option<&str>, + count: u64, + distinct_count: Option, + ) -> Self { + let null_count = 0; + self.with_tag_column_with_nulls_and_full_stats( + column_name, + min, + max, + count, + distinct_count, + null_count, + ) + } + + fn update_count(&mut self, count: usize) { + match self.num_rows { + Some(existing) => assert_eq!(existing, count), + None => self.num_rows = Some(count), + } + } + + /// Register a tag column with stats with the test chunk + pub fn with_tag_column_with_nulls_and_full_stats( + mut self, + column_name: impl Into, + min: Option<&str>, + max: Option<&str>, + count: u64, + distinct_count: Option, + null_count: u64, + ) -> Self { + let column_name = column_name.into(); + + // make a new schema with the specified column and + // merge it in to any existing schema + let new_column_schema = SchemaBuilder::new().tag(&column_name).build().unwrap(); + + // Construct stats + let stats = ColumnStatistics { + null_count: Precision::Exact(null_count as usize), + max_value: option_to_precision(max.map(ScalarValue::from)), + min_value: 
option_to_precision(min.map(ScalarValue::from)), + distinct_count: option_to_precision(distinct_count.map(|c| c.get() as usize)), + }; + + self.update_count(count as usize); + self.add_schema_to_table(new_column_schema, Some(stats)) + } + + /// Register a timestamp column with the test chunk with default stats + pub fn with_time_column(self) -> Self { + // make a new schema with the specified column and + // merge it in to any existing schema + let new_column_schema = SchemaBuilder::new().timestamp().build().unwrap(); + + self.add_schema_to_table(new_column_schema, None) + } + + /// Register a timestamp column with the test chunk + pub fn with_time_column_with_stats(self, min: Option, max: Option) -> Self { + self.with_time_column_with_full_stats(min, max, 0, None) + } + + /// Register a timestamp column with full stats with the test chunk + pub fn with_time_column_with_full_stats( + mut self, + min: Option, + max: Option, + count: u64, + distinct_count: Option, + ) -> Self { + // make a new schema with the specified column and + // merge it in to any existing schema + let new_column_schema = SchemaBuilder::new().timestamp().build().unwrap(); + let null_count = 0; + + // Construct stats + let stats = ColumnStatistics { + null_count: Precision::Exact(null_count as usize), + max_value: option_to_precision(max.map(timestamptz_nano)), + min_value: option_to_precision(min.map(timestamptz_nano)), + distinct_count: option_to_precision(distinct_count.map(|c| c.get() as usize)), + }; + + self.update_count(count as usize); + self.add_schema_to_table(new_column_schema, Some(stats)) + } + + pub fn with_timestamp_min_max(mut self, min: i64, max: i64) -> Self { + let stats = self + .column_stats + .get_mut(TIME_COLUMN_NAME) + .expect("stats in sync w/ columns"); + + stats.min_value = Precision::Exact(timestamptz_nano(min)); + stats.max_value = Precision::Exact(timestamptz_nano(max)); + + self + } + + impl_with_column!(with_i64_field_column, Int64); + 
impl_with_column_with_stats!(with_i64_field_column_with_stats, Int64, i64, I64); + + impl_with_column!(with_u64_column, UInt64); + impl_with_column_with_stats!(with_u64_field_column_with_stats, UInt64, u64, U64); + + impl_with_column!(with_f64_field_column, Float64); + impl_with_column_with_stats!(with_f64_field_column_with_stats, Float64, f64, F64); + + impl_with_column!(with_bool_field_column, Boolean); + impl_with_column_with_stats!(with_bool_field_column_with_stats, Boolean, bool, Bool); + + /// Register a string field column with the test chunk + pub fn with_string_field_column_with_stats( + self, + column_name: impl Into, + min: Option<&str>, + max: Option<&str>, + ) -> Self { + let column_name = column_name.into(); + + // make a new schema with the specified column and + // merge it in to any existing schema + let new_column_schema = SchemaBuilder::new() + .field(&column_name, DataType::Utf8) + .unwrap() + .build() + .unwrap(); + + // Construct stats + let stats = ColumnStatistics { + null_count: Precision::Absent, + max_value: option_to_precision(max.map(ScalarValue::from)), + min_value: option_to_precision(min.map(ScalarValue::from)), + distinct_count: Precision::Absent, + }; + + self.add_schema_to_table(new_column_schema, Some(stats)) + } + + /// Adds the specified schema and optionally a column summary containing optional stats. + /// If `add_column_summary` is false, `stats` is ignored. If `add_column_summary` is true but + /// `stats` is `None`, default stats will be added to the column summary. 
+ fn add_schema_to_table( + mut self, + new_column_schema: Schema, + input_stats: Option, + ) -> Self { + let mut merger = SchemaMerger::new(); + merger = merger.merge(&new_column_schema).unwrap(); + merger = merger.merge(&self.schema).expect("merging was successful"); + self.schema = merger.build(); + + for f in new_column_schema.inner().fields() { + self.column_stats.insert( + f.name().clone(), + input_stats.as_ref().cloned().unwrap_or_default(), + ); + } + + self + } + + /// Prepares this chunk to return a specific record batch with one + /// row of non null data. + /// tag: MA + pub fn with_one_row_of_data(mut self) -> Self { + // create arrays + let columns = self + .schema + .iter() + .map(|(_influxdb_column_type, field)| match field.data_type() { + DataType::Int64 => Arc::new(Int64Array::from(vec![1000])) as ArrayRef, + DataType::UInt64 => Arc::new(UInt64Array::from(vec![1000])) as ArrayRef, + DataType::Utf8 => Arc::new(StringArray::from(vec!["MA"])) as ArrayRef, + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![1000]).with_timezone_opt(tz.clone()), + ) as ArrayRef, + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + let dict: DictionaryArray = vec!["MA"].into_iter().collect(); + Arc::new(dict) as ArrayRef + } + DataType::Float64 => Arc::new(Float64Array::from(vec![99.5])) as ArrayRef, + DataType::Boolean => Arc::new(BooleanArray::from(vec![true])) as ArrayRef, + _ => unimplemented!( + "Unimplemented data type for test database: {:?}", + field.data_type() + ), + }) + .collect::>(); + + let batch = + RecordBatch::try_new(self.schema.as_arrow(), columns).expect("made record batch"); + if !self.quiet { + println!("TestChunk batch data: {batch:#?}"); + } + + self.push_record_batch(batch); + self + } + + /// Prepares this chunk to return a specific record batch with a single tag, field and timestamp like + pub fn with_one_row_of_specific_data( + mut 
self, + tag_val: impl AsRef, + field_val: i64, + ts_val: i64, + ) -> Self { + // create arrays + let columns = self + .schema + .iter() + .map(|(_influxdb_column_type, field)| match field.data_type() { + DataType::Int64 => Arc::new(Int64Array::from(vec![field_val])) as ArrayRef, + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![ts_val]).with_timezone_opt(tz.clone()), + ) as ArrayRef, + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + let dict: DictionaryArray = + vec![tag_val.as_ref()].into_iter().collect(); + Arc::new(dict) as ArrayRef + } + _ => unimplemented!( + "Unimplemented data type for test database: {:?}", + field.data_type() + ), + }) + .collect::>(); + + let batch = + RecordBatch::try_new(self.schema.as_arrow(), columns).expect("made record batch"); + if !self.quiet { + println!("TestChunk batch data: {batch:#?}"); + } + + self.push_record_batch(batch); + self + } + + /// Prepares this chunk to return a specific record batch with three + /// rows of non null data that look like, no duplicates within + /// "+------+------+-----------+-------------------------------+", + /// "| tag1 | tag2 | field_int | time |", + /// "+------+------+-----------+-------------------------------+", + /// "| WA | SC | 1000 | 1970-01-01 00:00:00.000008 |", + /// "| VT | NC | 10 | 1970-01-01 00:00:00.000010 |", + /// "| UT | RI | 70 | 1970-01-01 00:00:00.000020 |", + /// "+------+------+-----------+-------------------------------+", + /// Stats(min, max) : tag1(UT, WA), tag2(RI, SC), time(8000, 20000) + pub fn with_three_rows_of_data(mut self) -> Self { + // create arrays + let columns = self + .schema + .iter() + .map(|(_influxdb_column_type, field)| match field.data_type() { + DataType::Int64 => Arc::new(Int64Array::from(vec![1000, 10, 70])) as ArrayRef, + DataType::UInt64 => Arc::new(UInt64Array::from(vec![1000, 10, 70])) as ArrayRef, + DataType::Utf8 => match 
field.name().as_str() { + "tag1" => Arc::new(StringArray::from(vec!["WA", "VT", "UT"])) as ArrayRef, + "tag2" => Arc::new(StringArray::from(vec!["SC", "NC", "RI"])) as ArrayRef, + _ => Arc::new(StringArray::from(vec!["TX", "PR", "OR"])) as ArrayRef, + }, + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![8000, 10000, 20000]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + match field.name().as_str() { + "tag1" => Arc::new( + vec!["WA", "VT", "UT"] + .into_iter() + .collect::>(), + ) as ArrayRef, + "tag2" => Arc::new( + vec!["SC", "NC", "RI"] + .into_iter() + .collect::>(), + ) as ArrayRef, + _ => Arc::new( + vec!["TX", "PR", "OR"] + .into_iter() + .collect::>(), + ) as ArrayRef, + } + } + _ => unimplemented!( + "Unimplemented data type for test database: {:?}", + field.data_type() + ), + }) + .collect::>(); + + let batch = + RecordBatch::try_new(self.schema.as_arrow(), columns).expect("made record batch"); + + self.push_record_batch(batch); + self + } + + /// Prepares this chunk to return a specific record batch with four + /// rows of non null data that look like, duplicates within + /// "+------+------+-----------+-------------------------------+", + /// "| tag1 | tag2 | field_int | time |", + /// "+------+------+-----------+-------------------------------+", + /// "| WA | SC | 1000 | 1970-01-01 00:00:00.000028 |", + /// "| VT | NC | 10 | 1970-01-01 00:00:00.000210 |", (1) + /// "| UT | RI | 70 | 1970-01-01 00:00:00.000220 |", + /// "| VT | NC | 50 | 1970-01-01 00:00:00.000210 |", // duplicate of (1) + /// "+------+------+-----------+-------------------------------+", + /// Stats(min, max) : tag1(UT, WA), tag2(RI, SC), time(28000, 220000) + pub fn with_four_rows_of_data(mut self) -> Self { + // create arrays + let columns = self + .schema + .iter() + .map(|(_influxdb_column_type, field)| match 
field.data_type() { + DataType::Int64 => Arc::new(Int64Array::from(vec![1000, 10, 70, 50])) as ArrayRef, + DataType::Utf8 => match field.name().as_str() { + "tag1" => Arc::new(StringArray::from(vec!["WA", "VT", "UT", "VT"])) as ArrayRef, + "tag2" => Arc::new(StringArray::from(vec!["SC", "NC", "RI", "NC"])) as ArrayRef, + _ => Arc::new(StringArray::from(vec!["TX", "PR", "OR", "AL"])) as ArrayRef, + }, + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![28000, 210000, 220000, 210000]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + match field.name().as_str() { + "tag1" => Arc::new( + vec!["WA", "VT", "UT", "VT"] + .into_iter() + .collect::>(), + ) as ArrayRef, + "tag2" => Arc::new( + vec!["SC", "NC", "RI", "NC"] + .into_iter() + .collect::>(), + ) as ArrayRef, + _ => Arc::new( + vec!["TX", "PR", "OR", "AL"] + .into_iter() + .collect::>(), + ) as ArrayRef, + } + } + _ => unimplemented!( + "Unimplemented data type for test database: {:?}", + field.data_type() + ), + }) + .collect::>(); + + let batch = + RecordBatch::try_new(self.schema.as_arrow(), columns).expect("made record batch"); + + self.push_record_batch(batch); + self + } + + /// Prepares this chunk to return a specific record batch with five + /// rows of non null data that look like, no duplicates within + /// "+------+------+-----------+-------------------------------+", + /// "| tag1 | tag2 | field_int | time |", + /// "+------+------+-----------+-------------------------------+", + /// "| MT | CT | 1000 | 1970-01-01 00:00:00.000001 |", + /// "| MT | AL | 10 | 1970-01-01 00:00:00.000007 |", + /// "| CT | CT | 70 | 1970-01-01 00:00:00.000000100 |", + /// "| AL | MA | 100 | 1970-01-01 00:00:00.000000050 |", + /// "| MT | AL | 5 | 1970-01-01 00:00:00.000005 |", + /// "+------+------+-----------+-------------------------------+", + /// 
Stats(min, max) : tag1(AL, MT), tag2(AL, MA), time(5, 7000) + pub fn with_five_rows_of_data(mut self) -> Self { + // create arrays + let columns = self + .schema + .iter() + .map(|(_influxdb_column_type, field)| match field.data_type() { + DataType::Int64 => { + Arc::new(Int64Array::from(vec![1000, 10, 70, 100, 5])) as ArrayRef + } + DataType::Utf8 => { + match field.name().as_str() { + "tag1" => Arc::new(StringArray::from(vec!["MT", "MT", "CT", "AL", "MT"])) + as ArrayRef, + "tag2" => Arc::new(StringArray::from(vec!["CT", "AL", "CT", "MA", "AL"])) + as ArrayRef, + _ => Arc::new(StringArray::from(vec!["CT", "MT", "AL", "AL", "MT"])) + as ArrayRef, + } + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![1000, 7000, 100, 50, 5000]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + match field.name().as_str() { + "tag1" => Arc::new( + vec!["MT", "MT", "CT", "AL", "MT"] + .into_iter() + .collect::>(), + ) as ArrayRef, + "tag2" => Arc::new( + vec!["CT", "AL", "CT", "MA", "AL"] + .into_iter() + .collect::>(), + ) as ArrayRef, + _ => Arc::new( + vec!["CT", "MT", "AL", "AL", "MT"] + .into_iter() + .collect::>(), + ) as ArrayRef, + } + } + _ => unimplemented!( + "Unimplemented data type for test database: {:?}", + field.data_type() + ), + }) + .collect::>(); + + let batch = + RecordBatch::try_new(self.schema.as_arrow(), columns).expect("made record batch"); + + self.push_record_batch(batch); + self + } + + /// Prepares this chunk to return a specific record batch with ten + /// rows of non null data that look like, duplicates within + /// "+------+------+-----------+-------------------------------+", + /// "| tag1 | tag2 | field_int | time |", + /// "+------+------+-----------+-------------------------------+", + /// "| MT | CT | 1000 | 1970-01-01 00:00:00.000001 |", + /// "| MT | AL | 10 | 1970-01-01 
00:00:00.000007 |", (1) + /// "| CT | CT | 70 | 1970-01-01 00:00:00.000000100 |", + /// "| AL | MA | 100 | 1970-01-01 00:00:00.000000050 |", (2) + /// "| MT | AL | 5 | 1970-01-01 00:00:00.000005 |", (3) + /// "| MT | CT | 1000 | 1970-01-01 00:00:00.000002 |", + /// "| MT | AL | 20 | 1970-01-01 00:00:00.000007 |", // Duplicate with (1) + /// "| CT | CT | 70 | 1970-01-01 00:00:00.000000500 |", + /// "| AL | MA | 10 | 1970-01-01 00:00:00.000000050 |", // Duplicate with (2) + /// "| MT | AL | 30 | 1970-01-01 00:00:00.000005 |", // Duplicate with (3) + /// "+------+------+-----------+-------------------------------+", + /// Stats(min, max) : tag1(AL, MT), tag2(AL, MA), time(5, 7000) + pub fn with_ten_rows_of_data_some_duplicates(mut self) -> Self { + // create arrays + let columns = self + .schema + .iter() + .map(|(_influxdb_column_type, field)| match field.data_type() { + DataType::Int64 => Arc::new(Int64Array::from(vec![ + 1000, 10, 70, 100, 5, 1000, 20, 70, 10, 30, + ])) as ArrayRef, + DataType::Utf8 => match field.name().as_str() { + "tag1" => Arc::new(StringArray::from(vec![ + "MT", "MT", "CT", "AL", "MT", "MT", "MT", "CT", "AL", "MT", + ])) as ArrayRef, + "tag2" => Arc::new(StringArray::from(vec![ + "CT", "AL", "CT", "MA", "AL", "CT", "AL", "CT", "MA", "AL", + ])) as ArrayRef, + _ => Arc::new(StringArray::from(vec![ + "CT", "MT", "AL", "AL", "MT", "CT", "MT", "AL", "AL", "MT", + ])) as ArrayRef, + }, + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![ + 1000, 7000, 100, 50, 5, 2000, 7000, 500, 50, 5, + ]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + match field.name().as_str() { + "tag1" => Arc::new( + vec!["MT", "MT", "CT", "AL", "MT", "MT", "MT", "CT", "AL", "MT"] + .into_iter() + .collect::>(), + ) as ArrayRef, + "tag2" => Arc::new( + vec!["CT", "AL", "CT", "MA", "AL", "CT", "AL", "CT", 
"MA", "AL"] + .into_iter() + .collect::>(), + ) as ArrayRef, + _ => Arc::new( + vec!["CT", "MT", "AL", "AL", "MT", "CT", "MT", "AL", "AL", "MT"] + .into_iter() + .collect::>(), + ) as ArrayRef, + } + } + _ => unimplemented!( + "Unimplemented data type for test database: {:?}", + field.data_type() + ), + }) + .collect::>(); + + let batch = + RecordBatch::try_new(self.schema.as_arrow(), columns).expect("made record batch"); + + self.push_record_batch(batch); + self + } + + /// Set the sort key for this chunk + pub fn with_sort_key(self, sort_key: SortKey) -> Self { + Self { + sort_key: Some(sort_key), + ..self + } + } + + pub fn table_name(&self) -> &str { + &self.table_name + } +} + +impl fmt::Display for TestChunk { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.table_name()) + } +} + +impl QueryChunk for TestChunk { + fn stats(&self) -> Arc { + self.check_error().unwrap(); + + Arc::new(DataFusionStatistics { + num_rows: option_to_precision(self.num_rows), + total_byte_size: Precision::Absent, + column_statistics: self + .schema + .inner() + .fields() + .iter() + .map(|f| self.column_stats.get(f.name()).cloned().unwrap_or_default()) + .collect(), + }) + } + + fn schema(&self) -> &Schema { + &self.schema + } + + fn partition_id(&self) -> &TransitionPartitionId { + &self.partition_id + } + + fn sort_key(&self) -> Option<&SortKey> { + self.sort_key.as_ref() + } + + fn id(&self) -> ChunkId { + self.id + } + + fn may_contain_pk_duplicates(&self) -> bool { + self.may_contain_pk_duplicates + } + + fn data(&self) -> QueryChunkData { + self.check_error().unwrap(); + + match &self.table_data { + TestChunkData::RecordBatches(batches) => { + QueryChunkData::in_mem(batches.clone(), Arc::clone(self.schema.inner())) + } + TestChunkData::Parquet(input) => QueryChunkData::Parquet(input.clone()), + } + } + + fn chunk_type(&self) -> &str { + "Test Chunk" + } + + fn order(&self) -> ChunkOrder { + self.order + } + + fn as_any(&self) -> &dyn Any { + 
self + } +} + +/// Return the raw data from the list of chunks +pub async fn raw_data(chunks: &[Arc]) -> Vec { + let ctx = IOxSessionContext::with_testing(); + let mut batches = vec![]; + for c in chunks { + batches.append(&mut c.data().read_to_batches(c.schema(), ctx.inner()).await); + } + batches +} + +pub fn format_logical_plan(plan: &LogicalPlan) -> Vec { + format_lines(&plan.display_indent().to_string()) +} + +pub fn format_execution_plan(plan: &Arc) -> Vec { + format_lines(&displayable(plan.as_ref()).indent(false).to_string()) +} + +fn format_lines(s: &str) -> Vec { + s.trim() + .split('\n') + .map(|s| { + // Always add a leading space to ensure tha all lines in the YAML insta snapshots are quoted, otherwise the + // alignment gets messed up and the snapshot would be hard to read. + format!(" {s}") + }) + .collect() +} diff --git a/iox_query/src/util.rs b/iox_query/src/util.rs new file mode 100644 index 0000000..7cd92a4 --- /dev/null +++ b/iox_query/src/util.rs @@ -0,0 +1,325 @@ +//! 
This module contains DataFusion utility functions and helpers + +use std::{ + cmp::{max, min}, + sync::Arc, +}; + +use arrow::{ + array::TimestampNanosecondArray, + compute::SortOptions, + datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}, + record_batch::RecordBatch, +}; + +use data_types::TimestampMinMax; +use datafusion::common::stats::Precision; +use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries}; +use datafusion::{ + self, + common::ToDFSchema, + datasource::{provider_as_source, MemTable}, + error::DataFusionError, + execution::context::ExecutionProps, + logical_expr::{interval_arithmetic::Interval, LogicalPlan, LogicalPlanBuilder}, + optimizer::simplify_expressions::{ExprSimplifier, SimplifyContext}, + physical_expr::create_physical_expr, + physical_plan::{ + expressions::{col as physical_col, PhysicalSortExpr}, + PhysicalExpr, + }, + prelude::{Column, Expr}, +}; + +use itertools::Itertools; +use observability_deps::tracing::trace; +use schema::{sort::SortKey, TIME_COLUMN_NAME}; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +#[allow(missing_copy_implementations, missing_docs)] +pub enum Error { + #[snafu(display("The Record batch is empty"))] + EmptyBatch, + + #[snafu(display("Error while searching Time column in a Record Batch"))] + TimeColumn { source: arrow::error::ArrowError }, + + #[snafu(display("Error while casting Timenanosecond on Time column"))] + TimeCasting, + + #[snafu(display("Time column does not have value"))] + TimeValue, + + #[snafu(display("Time column is null"))] + TimeNull, +} + +/// A specialized `Error` +pub type Result = std::result::Result; + +/// Create a logical plan that produces the record batch +pub fn make_scan_plan(batch: RecordBatch) -> std::result::Result { + let schema = batch.schema(); + let partitions = vec![vec![batch]]; + let projection = None; // scan all columns + + let table = MemTable::try_new(schema, partitions)?; + + let source = 
provider_as_source(Arc::new(table)); + + LogicalPlanBuilder::scan("memtable", source, projection)?.build() +} + +pub fn logical_sort_key_exprs(sort_key: &SortKey) -> Vec { + sort_key + .iter() + .map(|(key, options)| { + let expr = Expr::Column(Column::from_name(key.as_ref())); + expr.sort(!options.descending, options.nulls_first) + }) + .collect() +} + +pub fn arrow_sort_key_exprs( + sort_key: &SortKey, + input_schema: &ArrowSchema, +) -> Vec { + sort_key + .iter() + .flat_map(|(key, options)| { + // Skip over missing columns + let expr = physical_col(key, input_schema).ok()?; + Some(PhysicalSortExpr { + expr, + options: SortOptions { + descending: options.descending, + nulls_first: options.nulls_first, + }, + }) + }) + .collect() +} + +/// Build a datafusion physical expression from a logical one +pub fn df_physical_expr( + schema: ArrowSchemaRef, + expr: Expr, +) -> std::result::Result, DataFusionError> { + let df_schema = Arc::clone(&schema).to_dfschema_ref()?; + + let props = ExecutionProps::new(); + let simplifier = + ExprSimplifier::new(SimplifyContext::new(&props).with_schema(Arc::clone(&df_schema))); + + // apply type coercion here to ensure types match + trace!(%df_schema, "input schema"); + let expr = simplifier.coerce(expr, Arc::clone(&df_schema))?; + trace!(%expr, "coerced logical expression"); + + create_physical_expr(&expr, df_schema.as_ref(), schema.as_ref(), &props) +} + +/// Return min and max for column `time` of the given set of record batches by +/// performing an `O(n)` scan of all provided batches. 
+pub fn compute_timenanosecond_min_max<'a, I>(batches: I) -> Result<TimestampMinMax>
+where
+    I: IntoIterator<Item = &'a RecordBatch>,
+{
+    let mut min_time = i64::MAX;
+    let mut max_time = i64::MIN;
+    for batch in batches {
+        let (mi, ma) = compute_timenanosecond_min_max_for_one_record_batch(batch)?;
+        min_time = min(min_time, mi);
+        max_time = max(max_time, ma);
+    }
+    // NOTE(review): an empty iterator yields min=i64::MAX > max=i64::MIN (an
+    // inverted range); callers presumably guarantee at least one batch — confirm.
+    Ok(TimestampMinMax {
+        min: min_time,
+        max: max_time,
+    })
+}
+
+/// Return min and max for column `time` in the given record batch by performing
+/// an `O(n)` scan of `batch`.
+pub fn compute_timenanosecond_min_max_for_one_record_batch(
+    batch: &RecordBatch,
+) -> Result<(i64, i64)> {
+    // A batch with no columns cannot contain a time column.
+    ensure!(batch.num_columns() > 0, EmptyBatchSnafu);
+
+    // Locate the time column by name in the batch's schema.
+    let index = batch
+        .schema()
+        .index_of(TIME_COLUMN_NAME)
+        .context(TimeColumnSnafu {})?;
+
+    let time_col = batch
+        .column(index)
+        .as_any()
+        .downcast_ref::<TimestampNanosecondArray>()
+        .context(TimeCastingSnafu {})?;
+
+    // Single scan; `minmax` distinguishes the empty / one-element / multi-element
+    // cases, and any null timestamp is reported as an error.
+    let (min, max) = match time_col.iter().minmax() {
+        itertools::MinMaxResult::NoElements => return Err(Error::TimeValue),
+        itertools::MinMaxResult::OneElement(val) => {
+            let val = val.context(TimeNullSnafu)?;
+            (val, val)
+        }
+        itertools::MinMaxResult::MinMax(min, max) => {
+            (min.context(TimeNullSnafu)?, max.context(TimeNullSnafu)?)
+        }
+    };
+
+    Ok((min, max))
+}
+
+/// Determine the possible maximum range for each of the fields in a
+/// [`ArrowSchema`] once the [`Expr`] has been applied. The returned
+/// Vec includes an Interval for every field in the schema in the same
+/// order. Any fields that are not constrained by the expression will
+/// have an unbounded interval.
+pub fn calculate_field_intervals( + schema: ArrowSchemaRef, + expr: Expr, +) -> Result, DataFusionError> { + // make unknown boundaries for each column + // TODO use upstream code when https://github.com/apache/arrow-datafusion/pull/8377 is merged + let fields = schema.fields(); + let boundaries = fields + .iter() + .enumerate() + .map(|(i, field)| { + let column = datafusion::physical_expr::expressions::Column::new(field.name(), i); + let interval = Interval::make_unbounded(field.data_type())?; + Ok(ExprBoundaries { + column, + interval, + distinct_count: Precision::Absent, + }) + }) + .collect::, DataFusionError>>()?; + + let context = AnalysisContext::new(boundaries); + let analysis_result = analyze( + &df_physical_expr(Arc::clone(&schema), expr)?, + context, + &schema, + )?; + + let intervals = analysis_result + .boundaries + .into_iter() + .map(|b| b.interval) + .collect::>(); + + Ok(intervals) +} + +/// Determine the possible maximum range for the named field in the +/// ['ArrowSchema'] once the ['Expr'] has been applied. 
+pub fn calculate_field_interval( + schema: ArrowSchemaRef, + expr: Expr, + name: &str, +) -> Result { + let idx = schema.index_of(name)?; + let mut intervals = calculate_field_intervals(Arc::clone(&schema), expr)?; + Ok(intervals.swap_remove(idx)) +} + +#[cfg(test)] +mod tests { + use datafusion::common::rounding::next_down; + use datafusion::common::ScalarValue; + use datafusion::logical_expr::{col, lit}; + use schema::{builder::SchemaBuilder, InfluxFieldType, TIME_DATA_TIMEZONE}; + + use super::*; + + fn time_interval(lower: Option, upper: Option) -> Interval { + let lower = ScalarValue::TimestampNanosecond(lower, TIME_DATA_TIMEZONE()); + let upper = ScalarValue::TimestampNanosecond(upper, TIME_DATA_TIMEZONE()); + Interval::try_new(lower, upper).unwrap() + } + + fn f64_interval(lower: Option, upper: Option) -> Interval { + let lower = ScalarValue::Float64(lower); + let upper = ScalarValue::Float64(upper); + Interval::try_new(lower, upper).unwrap() + } + + #[test] + fn test_calculate_field_intervals() { + let schema = SchemaBuilder::new() + .timestamp() + .influx_field("a", InfluxFieldType::Float) + .build() + .unwrap() + .as_arrow(); + let expr = col("time") + .gt_eq(lit("2020-01-01T00:00:00Z")) + .and(col("time").lt(lit("2020-01-02T00:00:00Z"))) + .and(col("a").gt_eq(lit(1000000.0))) + .and(col("a").lt(lit(2000000.0))); + let intervals = calculate_field_intervals(schema, expr).unwrap(); + // 2020-01-01T00:00:00Z == 1577836800000000000 + // 2020-01-02T00:00:00Z == 1577923200000000000 + assert_eq!( + vec![ + time_interval(Some(1577836800000000000), Some(1577923200000000000i64 - 1),), + f64_interval(Some(1000000.0), Some(next_down(2000000.0))) + ], + intervals + ); + } + + #[test] + fn test_calculate_field_intervals_no_constraints() { + let schema = SchemaBuilder::new() + .timestamp() + .influx_field("a", InfluxFieldType::Float) + .build() + .unwrap() + .as_arrow(); + // must be a predicate (boolean expression) + let expr = lit("test").eq(lit("foo")); + let 
intervals = calculate_field_intervals(schema, expr).unwrap(); + assert_eq!( + vec![time_interval(None, None), f64_interval(None, None)], + intervals + ); + } + + #[test] + fn test_calculate_field_interval() { + let schema = SchemaBuilder::new() + .timestamp() + .influx_field("a", InfluxFieldType::Float) + .build() + .unwrap() + .as_arrow(); + let expr = col("time") + .gt_eq(lit("2020-01-01T00:00:00Z")) + .and(col("time").lt(lit("2020-01-02T00:00:00Z"))) + .and(col("a").gt_eq(lit(1000000.0))) + .and(col("a").lt(lit(2000000.0))); + + // Note + // 2020-01-01T00:00:00Z == 1577836800000000000 + // 2020-01-02T00:00:00Z == 1577923200000000000 + let interval = calculate_field_interval(Arc::clone(&schema), expr.clone(), "time").unwrap(); + assert_eq!( + time_interval(Some(1577836800000000000), Some(1577923200000000000 - 1),), + interval + ); + + let interval = calculate_field_interval(Arc::clone(&schema), expr.clone(), "a").unwrap(); + assert_eq!( + f64_interval(Some(1000000.0), Some(next_down(2000000.0))), + interval + ); + + assert_eq!( + "Arrow error: Schema error: Unable to get field named \"b\". 
Valid fields: [\"time\", \"a\"]", + calculate_field_interval(Arc::clone(&schema), expr.clone(), "b").unwrap_err().to_string(), + ); + } +} diff --git a/iox_query_influxql/Cargo.toml b/iox_query_influxql/Cargo.toml new file mode 100644 index 0000000..0c11612 --- /dev/null +++ b/iox_query_influxql/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "iox_query_influxql" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +arrow = { workspace = true } +chrono-tz = { version = "0.8" } +datafusion = { workspace = true } +datafusion_util = { path = "../datafusion_util" } +generated_types = { path = "../generated_types" } +influxdb_influxql_parser = { path = "../influxdb_influxql_parser" } +iox_query = { path = "../iox_query" } +itertools = "0.12.0" +observability_deps = { path = "../observability_deps" } +once_cell = "1" +predicate = { path = "../predicate" } +query_functions = { path = "../query_functions" } +regex = "1" +schema = { path = "../schema" } +serde_json = "1.0.111" +thiserror = "1.0" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] # In alphabetical order +chrono = { version = "0.4", default-features = false } +test_helpers = { path = "../test_helpers" } +assert_matches = "1" +insta = { version = "1", features = ["yaml"] } diff --git a/iox_query_influxql/src/aggregate.rs b/iox_query_influxql/src/aggregate.rs new file mode 100644 index 0000000..badb279 --- /dev/null +++ b/iox_query_influxql/src/aggregate.rs @@ -0,0 +1,24 @@ +//! User defined aggregate functions implementing influxQL features. + +use datafusion::logical_expr::{ + AccumulatorFactoryFunction, AggregateUDF, ReturnTypeFunction, StateTypeFunction, +}; +use once_cell::sync::Lazy; +use std::sync::Arc; + +mod percentile; + +/// Definition of the `PERCENTILE` user-defined aggregate function. 
+pub(crate) static PERCENTILE: Lazy<Arc<AggregateUDF>> = Lazy::new(|| {
+    let return_type: ReturnTypeFunction = Arc::new(percentile::return_type);
+    let accumulator: AccumulatorFactoryFunction = Arc::new(percentile::accumulator);
+    let state_type: StateTypeFunction = Arc::new(percentile::state_type);
+
+    Arc::new(AggregateUDF::new(
+        percentile::NAME,
+        &percentile::SIGNATURE,
+        &return_type,
+        &accumulator,
+        &state_type,
+    ))
+});
diff --git a/iox_query_influxql/src/aggregate/percentile.rs b/iox_query_influxql/src/aggregate/percentile.rs
new file mode 100644
index 0000000..dda8659
--- /dev/null
+++ b/iox_query_influxql/src/aggregate/percentile.rs
+use crate::error;
+use arrow::array::{as_list_array, Array, ArrayRef, Float64Array, Int64Array};
+use arrow::datatypes::{DataType, Field};
+use datafusion::common::{downcast_value, DataFusionError, Result, ScalarValue};
+use datafusion::logical_expr::{Accumulator, Signature, TypeSignature, Volatility};
+use once_cell::sync::Lazy;
+use std::sync::Arc;
+
+/// The name of the percentile aggregate function.
+pub(super) const NAME: &str = "percentile";
+
+/// Valid signatures for the percentile aggregate function:
+/// every supported numeric input type paired with an Int64 or Float64
+/// percentile (`n`) argument.
+pub(super) static SIGNATURE: Lazy<Signature> = Lazy::new(|| {
+    Signature::one_of(
+        crate::NUMERICS
+            .iter()
+            .flat_map(|dt| {
+                [
+                    TypeSignature::Exact(vec![dt.clone(), DataType::Int64]),
+                    TypeSignature::Exact(vec![dt.clone(), DataType::Float64]),
+                ]
+            })
+            .collect(),
+        Volatility::Immutable,
+    )
+});
+
+/// Calculate the return type given the function signature. Percentile
+/// always returns the same type as the input column.
+pub(super) fn return_type(signature: &[DataType]) -> Result<Arc<DataType>> {
+    Ok(Arc::new(signature[0].clone()))
+}
+
+/// Create a new accumulator for the data type.
+pub(super) fn accumulator(dt: &DataType) -> Result<Box<dyn Accumulator>> {
+    Ok(Box::new(PercentileAccumulator::new(dt.clone())))
+}
+
+/// Calculate the intermediate merge state for the aggregator.
+pub(super) fn state_type(dt: &DataType) -> Result<Arc<Vec<DataType>>> {
+    Ok(Arc::new(vec![
+        // all input values buffered so far, carried between partial aggregations
+        DataType::List(Arc::new(Field::new("item", dt.clone(), true))),
+        // the requested percentile (the `n` argument)
+        DataType::Float64,
+    ]))
+}
+
+#[derive(Debug)]
+struct PercentileAccumulator {
+    // data type of the aggregated column; all buffered values must match it
+    data_type: DataType,
+    // every non-null input value seen so far (sorted lazily at evaluate time)
+    data: Vec<ScalarValue>,
+    // the percentile argument; captured from the first non-null `n` value
+    percentile: Option<f64>,
+}
+
+impl PercentileAccumulator {
+    fn new(data_type: DataType) -> Self {
+        Self {
+            data_type,
+            data: vec![],
+            percentile: None,
+        }
+    }
+
+    /// Buffer all non-null values of `array` into `self.data`.
+    fn update(&mut self, array: ArrayRef) -> Result<()> {
+        let array = Arc::clone(&array);
+        assert_eq!(array.data_type(), &self.data_type);
+
+        // Reserve exactly the number of non-null entries we are about to push.
+        let nulls = array.nulls();
+        let null_len = nulls.map_or(0, |nb| nb.null_count());
+        self.data.reserve(array.len() - null_len);
+        for idx in 0..array.len() {
+            if nulls.map_or(true, |nb| nb.is_valid(idx)) {
+                self.data.push(ScalarValue::try_from_array(&array, idx)?)
+            }
+        }
+        Ok(())
+    }
+
+    /// Record the percentile (`n`) argument the first time a non-null value is seen.
+    fn set_percentile(&mut self, array: ArrayRef) -> Result<()> {
+        if self.percentile.is_none() && array.is_valid(0) {
+            self.percentile = match array.data_type() {
+                DataType::Int64 => Some(downcast_value!(array, Int64Array).value(0) as f64),
+                DataType::Float64 => Some(downcast_value!(array, Float64Array).value(0)),
+                dt => {
+                    return error::internal(format!(
+                        "invalid data type ({dt}) for PERCENTILE n argument"
+                    ))
+                }
+            };
+        }
+        Ok(())
+    }
+}
+
+impl Accumulator for PercentileAccumulator {
+    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
+        // values[0] is the aggregated column, values[1] the percentile argument
+        assert_eq!(values.len(), 2);
+
+        self.set_percentile(Arc::clone(&values[1]))?;
+        self.update(Arc::clone(&values[0]))
+    }
+
+    fn evaluate(&self) -> Result<ScalarValue> {
+        // Null result when no percentile was given or no data point maps to it.
+        let idx = self
+            .percentile
+            .and_then(|n| percentile_idx(self.data.len(), n));
+        if idx.is_none() {
+            return Ok(ScalarValue::Float64(None));
+        }
+
+        // Sort lazily at evaluation time rather than maintaining sorted state.
+        let array = ScalarValue::iter_to_array(self.data.clone())?;
+        let indices = arrow::compute::sort_to_indices(&array, None, None)?;
+        let array_idx = indices.value(idx.unwrap());
+        ScalarValue::try_from_array(&array, array_idx as usize)
+    }
+
+    fn size(&self) -> usize {
+        std::mem::size_of::<Option<f64>>()
+            + std::mem::size_of::<DataType>()
+            + ScalarValue::size_of_vec(&self.data)
+    }
+
+    fn state(&self) -> Result<Vec<ScalarValue>> {
+        // Serialize the buffered values plus the percentile for partial merges.
+        let arr = ScalarValue::new_list(&self.data, &self.data_type);
+        Ok(vec![
+            ScalarValue::List(arr),
+            ScalarValue::Float64(self.percentile),
+        ])
+    }
+
+    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
+        assert_eq!(states.len(), 2);
+
+        self.set_percentile(Arc::clone(&states[1]))?;
+
+        // states[0] is a list array; each element is one partial accumulator's data.
+        let array = Arc::clone(&states[0]);
+        let list_array = as_list_array(&array);
+        for idx in 0..list_array.len() {
+            self.update(list_array.value(idx))?;
+        }
+        Ok(())
+    }
+}
+
+/// Calculate the location in an ordered list of len items where the
+/// location of the item at the given percentile would be found.
+///
+/// This uses the same algorithm as the original influxdb implementation
+/// of percentile as can be found in
+/// <https://github.com/influxdata/influxdb>.
+fn percentile_idx(len: usize, percentile: f64) -> Option<usize> {
+    // Round-half-up to a 1-based rank, then convert to a 0-based index;
+    // out-of-range ranks (including negative) yield None.
+    match TryInto::<usize>::try_into(
+        (((len as f64) * percentile / 100.0 + 0.5).floor() as isize) - 1,
+    ) {
+        Ok(idx) if idx < len => Some(idx),
+        _ => None,
+    }
+}
diff --git a/iox_query_influxql/src/error.rs b/iox_query_influxql/src/error.rs
new file mode 100644
index 0000000..cc2dd6d
--- /dev/null
+++ b/iox_query_influxql/src/error.rs
+use datafusion::common::Result;
+
+/// An error that was the result of an invalid InfluxQL query.
+pub(crate) fn query<T>(s: impl Into<String>) -> Result<T> {
+    Err(map::query(s))
+}
+
+/// An unexpected error whilst planning that represents a bug in IOx.
+pub(crate) fn internal<T>(s: impl Into<String>) -> Result<T> {
+    Err(map::internal(s))
+}
+
+/// The specified `feature` is not implemented.
+pub(crate) fn not_implemented<T>(feature: impl Into<String>) -> Result<T> {
+    Err(map::not_implemented(feature))
+}
+
+/// Functions that return a `DataFusionError` rather than a `Result`,
+/// making them convenient to use with functions like `map_err`.
+pub(crate) mod map { + use datafusion::common::DataFusionError; + use influxdb_influxql_parser::time_range::ExprError; + use thiserror::Error; + + #[derive(Debug, Error)] + enum PlannerError { + /// An unexpected error that represents a bug in IOx. + /// + /// The message is prefixed with `InfluxQL internal error: `, + /// which may be used by clients to identify internal InfluxQL + /// errors. + #[error("InfluxQL internal error: {0}")] + Internal(String), + } + + /// An error that was the result of an invalid InfluxQL query. + pub(crate) fn query(s: impl Into) -> DataFusionError { + DataFusionError::Plan(s.into()) + } + + /// An unexpected error whilst planning that represents a bug in IOx. + pub(crate) fn internal(s: impl Into) -> DataFusionError { + DataFusionError::External(Box::new(PlannerError::Internal(s.into()))) + } + + /// The specified `feature` is not implemented. + pub(crate) fn not_implemented(feature: impl Into) -> DataFusionError { + DataFusionError::NotImplemented(feature.into()) + } + + /// Map an [`ExprError`] to a DataFusion error. 
+ pub(crate) fn expr_error(err: ExprError) -> DataFusionError { + match err { + ExprError::Expression(s) => query(s), + ExprError::Internal(s) => internal(s), + } + } + + #[cfg(test)] + mod test { + use crate::error::map::PlannerError; + + #[test] + fn test_planner_error_display() { + // The InfluxQL internal error: + assert!(PlannerError::Internal("****".to_owned()) + .to_string() + .starts_with("InfluxQL internal error: ")) + } + } +} diff --git a/iox_query_influxql/src/frontend/mod.rs b/iox_query_influxql/src/frontend/mod.rs new file mode 100644 index 0000000..5e48085 --- /dev/null +++ b/iox_query_influxql/src/frontend/mod.rs @@ -0,0 +1 @@ +pub mod planner; diff --git a/iox_query_influxql/src/frontend/planner.rs b/iox_query_influxql/src/frontend/planner.rs new file mode 100644 index 0000000..f8f6ff0 --- /dev/null +++ b/iox_query_influxql/src/frontend/planner.rs @@ -0,0 +1,420 @@ +use arrow::datatypes::SchemaRef; +use datafusion::common::ParamValues; +use datafusion::physical_expr::execution_props::ExecutionProps; +use influxdb_influxql_parser::show_field_keys::ShowFieldKeysStatement; +use influxdb_influxql_parser::show_measurements::ShowMeasurementsStatement; +use influxdb_influxql_parser::show_tag_keys::ShowTagKeysStatement; +use influxdb_influxql_parser::show_tag_values::ShowTagValuesStatement; +use std::any::Any; +use std::collections::{HashMap, HashSet}; +use std::fmt; +use std::fmt::Debug; +use std::ops::Deref; +use std::sync::Arc; + +use crate::plan::{parse_regex, InfluxQLToLogicalPlan, SchemaProvider}; +use datafusion::datasource::provider_as_source; +use datafusion::execution::context::{SessionState, TaskContext}; +use datafusion::logical_expr::{AggregateUDF, LogicalPlan, ScalarUDF, TableSource}; +use datafusion::physical_expr::PhysicalSortExpr; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, Partitioning, SendableRecordBatchStream, +}; +use datafusion::{ + error::{DataFusionError, Result}, + physical_plan::ExecutionPlan, +}; +use 
influxdb_influxql_parser::common::MeasurementName; +use influxdb_influxql_parser::parse_statements; +use influxdb_influxql_parser::statement::Statement; +use influxdb_influxql_parser::visit::{Visitable, Visitor}; +use iox_query::exec::IOxSessionContext; +use observability_deps::tracing::debug; +use schema::Schema; + +struct ContextSchemaProvider<'a> { + state: &'a SessionState, + tables: HashMap, Schema)>, +} + +impl<'a> SchemaProvider for ContextSchemaProvider<'a> { + fn get_table_provider(&self, name: &str) -> Result> { + self.tables + .get(name) + .map(|(t, _)| Arc::clone(t)) + .ok_or_else(|| DataFusionError::Plan(format!("measurement does not exist: {name}"))) + } + + fn get_function_meta(&self, name: &str) -> Option> { + self.state.scalar_functions().get(name).cloned() + } + + fn get_aggregate_meta(&self, name: &str) -> Option> { + self.state.aggregate_functions().get(name).cloned() + } + + fn table_names(&self) -> Vec<&'_ str> { + self.tables.keys().map(|k| k.as_str()).collect::>() + } + + fn table_exists(&self, name: &str) -> bool { + self.tables.contains_key(name) + } + + fn table_schema(&self, name: &str) -> Option { + self.tables.get(name).map(|(_, s)| s.clone()) + } + + fn execution_props(&self) -> &ExecutionProps { + self.state.execution_props() + } +} + +/// A physical operator that overrides the `schema` API, +/// to return an amended version owned by `SchemaExec`. The +/// principal use case is to add additional metadata to the schema. 
+struct SchemaExec {
+    input: Arc<dyn ExecutionPlan>,
+    // the amended schema reported in place of `input.schema()`
+    schema: SchemaRef,
+}
+
+impl Debug for SchemaExec {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.fmt_as(DisplayFormatType::Default, f)
+    }
+}
+
+impl ExecutionPlan for SchemaExec {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> SchemaRef {
+        // The override: report the amended schema, not the input's.
+        Arc::clone(&self.schema)
+    }
+
+    fn output_partitioning(&self) -> Partitioning {
+        self.input.output_partitioning()
+    }
+
+    fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
+        self.input.output_ordering()
+    }
+
+    fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
+        vec![Arc::clone(&self.input)]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        _children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        unimplemented!()
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> Result<SendableRecordBatchStream> {
+        // Execution is delegated untouched; only the schema is amended.
+        self.input.execute(partition, context)
+    }
+
+    fn statistics(&self) -> Result<datafusion::physical_plan::Statistics> {
+        Ok(datafusion::physical_plan::Statistics::new_unknown(
+            &self.schema(),
+        ))
+    }
+}
+
+impl DisplayAs for SchemaExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(f, "SchemaExec")
+            }
+        }
+    }
+}
+
+/// Create plans for running InfluxQL queries against databases
+#[derive(Debug, Default, Copy, Clone)]
+pub struct InfluxQLQueryPlanner {}
+
+impl InfluxQLQueryPlanner {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Plan an InfluxQL query against the catalogs registered with `ctx`, and return a
+    /// DataFusion physical execution plan that runs on the query executor.
+    pub async fn query(
+        &self,
+        query: &str,
+        params: impl Into<ParamValues> + Send,
+        ctx: &IOxSessionContext,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let ctx = ctx.child_ctx("InfluxQLQueryPlanner::query");
+        debug!(text=%query, "planning InfluxQL query");
+
+        let statement = self.query_to_statement(query)?;
+        let logical_plan = self.statement_to_plan(statement, &ctx).await?;
+        // add params to plan only when they're non-empty
+        let logical_plan = match params.into() {
+            ParamValues::List(v) if !v.is_empty() => logical_plan.with_param_values(v)?,
+            ParamValues::Map(m) if !m.is_empty() => logical_plan.with_param_values(m)?,
+            _ => logical_plan,
+        };
+        let input = ctx.create_physical_plan(&logical_plan).await?;
+
+        // Merge schema-level metadata from the logical plan with the
+        // schema from the physical plan, as it is not propagated through the
+        // physical planning process.
+        let input_schema = input.schema();
+        let mut md = input_schema.metadata().clone();
+        md.extend(logical_plan.schema().metadata().clone());
+        let schema = Arc::new(arrow::datatypes::Schema::new_with_metadata(
+            input_schema.fields().clone(),
+            md,
+        ));
+
+        Ok(Arc::new(SchemaExec { input, schema }))
+    }
+
+    async fn statement_to_plan(
+        &self,
+        statement: Statement,
+        ctx: &IOxSessionContext,
+    ) -> Result<LogicalPlan> {
+        use std::collections::hash_map::Entry;
+
+        let ctx = ctx.child_ctx("statement_to_plan");
+        let session_cfg = ctx.inner().copied_config();
+        let cfg = session_cfg.options();
+        // Resolve the default catalog/schema pair the query will run against.
+        let schema = ctx
+            .inner()
+            .catalog(&cfg.catalog.default_catalog)
+            .ok_or_else(|| {
+                DataFusionError::Plan(format!(
+                    "failed to resolve catalog: {}",
+                    cfg.catalog.default_catalog
+                ))
+            })?
+ .schema(&cfg.catalog.default_schema) + .ok_or_else(|| { + DataFusionError::Plan(format!( + "failed to resolve schema: {}", + cfg.catalog.default_schema + )) + })?; + let names = schema.table_names(); + let query_tables = find_all_measurements(&statement, &names)?; + + let mut sp = ContextSchemaProvider { + state: &ctx.inner().state(), + tables: HashMap::with_capacity(query_tables.len()), + }; + + for table_name in &query_tables { + if let Entry::Vacant(v) = sp.tables.entry(table_name.to_string()) { + let mut ctx = ctx.child_ctx("get table schema"); + ctx.set_metadata("table", table_name.to_owned()); + + if let Some(table) = schema.table(table_name).await { + let schema = Schema::try_from(table.schema()) + .map_err(|err| { + DataFusionError::Internal(format!("unable to convert DataFusion schema for measurement {table_name} to IOx schema: {err}")) + })?; + v.insert((provider_as_source(table), schema)); + } + } + } + + let planner = InfluxQLToLogicalPlan::new(&sp, &ctx); + let logical_plan = planner.statement_to_plan(statement)?; + debug!(plan=%logical_plan.display_graphviz(), "logical plan"); + Ok(logical_plan) + } + + fn query_to_statement(&self, query: &str) -> Result { + let mut statements = + parse_statements(query).map_err(|e| DataFusionError::Plan(e.to_string()))?; + + if statements.len() != 1 { + return Err(DataFusionError::NotImplemented( + "The context currently only supports a single InfluxQL statement".to_string(), + )); + } + + Ok(statements.pop().unwrap()) + } +} + +fn find_all_measurements(stmt: &Statement, tables: &[String]) -> Result> { + struct Matcher<'a>(&'a mut HashSet, &'a [String]); + impl<'a> Visitor for Matcher<'a> { + type Error = DataFusionError; + + fn post_visit_measurement_name( + self, + mn: &MeasurementName, + ) -> std::result::Result { + match mn { + MeasurementName::Name(name) => { + let name = name.deref(); + if self.1.contains(name) { + self.0.insert(name.to_string()); + } + } + MeasurementName::Regex(re) => { + let re = 
parse_regex(re)?; + + self.1 + .iter() + .filter(|table| re.is_match(table)) + .for_each(|table| { + self.0.insert(table.into()); + }); + } + } + + Ok(self) + } + + fn post_visit_show_measurements_statement( + self, + sm: &ShowMeasurementsStatement, + ) -> Result { + if sm.with_measurement.is_none() { + self.0.extend(self.1.iter().cloned()); + } + + Ok(self) + } + + fn post_visit_show_field_keys_statement( + self, + sfk: &ShowFieldKeysStatement, + ) -> Result { + if sfk.from.is_none() { + self.0.extend(self.1.iter().cloned()); + } + + Ok(self) + } + + fn post_visit_show_tag_values_statement( + self, + stv: &ShowTagValuesStatement, + ) -> Result { + if stv.from.is_none() { + self.0.extend(self.1.iter().cloned()); + } + + Ok(self) + } + + fn post_visit_show_tag_keys_statement( + self, + stk: &ShowTagKeysStatement, + ) -> std::result::Result { + if stk.from.is_none() { + self.0.extend(self.1.iter().cloned()); + } + + Ok(self) + } + } + + let mut m = HashSet::new(); + let vis = Matcher(&mut m, tables); + stmt.accept(vis)?; + + Ok(m) +} + +#[cfg(test)] +mod test { + use super::*; + use itertools::Itertools; + use test_helpers::assert_error; + + #[test] + fn test_query_to_statement() { + let p = InfluxQLQueryPlanner::new(); + + // succeeds for a single statement + let _ = p.query_to_statement("SELECT foo FROM bar").unwrap(); + + // Fallible + + assert_error!( + p.query_to_statement("SELECT foo FROM bar; SELECT bar FROM foo"), + DataFusionError::NotImplemented(ref s) if s == "The context currently only supports a single InfluxQL statement" + ); + } + + #[test] + fn test_find_all_measurements() { + fn find(q: &str) -> Vec { + let p = InfluxQLQueryPlanner::new(); + let s = p.query_to_statement(q).unwrap(); + let tables = vec!["foo".into(), "bar".into(), "foobar".into()]; + let res = find_all_measurements(&s, &tables).unwrap(); + res.into_iter().sorted().collect() + } + + assert_eq!(find("SELECT * FROM foo"), vec!["foo"]); + assert_eq!(find("SELECT * FROM foo, foo"), 
vec!["foo"]); + assert_eq!(find("SELECT * FROM foo, bar"), vec!["bar", "foo"]); + assert_eq!(find("SELECT * FROM foo, none"), vec!["foo"]); + assert_eq!(find("SELECT * FROM /^foo/"), vec!["foo", "foobar"]); + assert_eq!(find("SELECT * FROM foo, /^bar/"), vec!["bar", "foo"]); + assert_eq!(find("SELECT * FROM //"), vec!["bar", "foo", "foobar"]); + + // Find all measurements in subqueries + assert_eq!( + find("SELECT * FROM foo, (SELECT * FROM bar)"), + vec!["bar", "foo"] + ); + assert_eq!( + find("SELECT * FROM foo, (SELECT * FROM /bar/)"), + vec!["bar", "foo", "foobar"] + ); + + // Find all measurements in `SHOW MEASUREMENTS` + assert_eq!(find("SHOW MEASUREMENTS"), vec!["bar", "foo", "foobar"]); + assert_eq!( + find("SHOW MEASUREMENTS WITH MEASUREMENT = foo"), + vec!["foo"] + ); + assert_eq!( + find("SHOW MEASUREMENTS WITH MEASUREMENT =~ /^foo/"), + vec!["foo", "foobar"] + ); + + // Find all measurements in `SHOW FIELD KEYS` + assert_eq!(find("SHOW FIELD KEYS"), vec!["bar", "foo", "foobar"]); + assert_eq!(find("SHOW FIELD KEYS FROM /^foo/"), vec!["foo", "foobar"]); + + // Find all measurements in `SHOW TAG VALUES` + assert_eq!( + find("SHOW TAG VALUES WITH KEY = \"k\""), + vec!["bar", "foo", "foobar"] + ); + assert_eq!( + find("SHOW TAG VALUES FROM /^foo/ WITH KEY = \"k\""), + vec!["foo", "foobar"] + ); + + // Find all measurements in `SHOW TAG KEYS` + assert_eq!(find("SHOW TAG KEYS"), vec!["bar", "foo", "foobar"]); + assert_eq!(find("SHOW TAG KEYS FROM /^foo/"), vec!["foo", "foobar"]); + + // Finds no measurements + assert!(find("SELECT * FROM none").is_empty()); + assert!(find("SELECT * FROM (SELECT * FROM none)").is_empty()); + assert!(find("SELECT * FROM /^l/").is_empty()); + assert!(find("SELECT * FROM (SELECT * FROM /^l/)").is_empty()); + } +} diff --git a/iox_query_influxql/src/lib.rs b/iox_query_influxql/src/lib.rs new file mode 100644 index 0000000..e236959 --- /dev/null +++ b/iox_query_influxql/src/lib.rs @@ -0,0 +1,28 @@ +//! 
Contains the IOx InfluxQL query planner
+#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)]
+#![warn(
+    missing_debug_implementations,
+    clippy::explicit_iter_loop,
+    clippy::use_self,
+    clippy::clone_on_ref_ptr,
+    // See https://github.com/influxdata/influxdb_iox/pull/1671
+    clippy::future_not_send,
+    clippy::todo,
+    clippy::dbg_macro,
+    unused_crate_dependencies
+)]
+
+use arrow::datatypes::DataType;
+
+// Workaround for "unused crate" lint false positives.
+use workspace_hack as _;
+
+mod aggregate;
+mod error;
+pub mod frontend;
+pub mod plan;
+mod window;
+
+/// A list of the numeric types supported by InfluxQL that can be used
+/// as input to user-defined functions.
+static NUMERICS: &[DataType] = &[DataType::Int64, DataType::UInt64, DataType::Float64];
diff --git a/iox_query_influxql/src/plan/expr_type_evaluator.rs b/iox_query_influxql/src/plan/expr_type_evaluator.rs
new file mode 100644
index 0000000..e2103ba
--- /dev/null
+++ b/iox_query_influxql/src/plan/expr_type_evaluator.rs
@@ -0,0 +1,808 @@
+use crate::error;
+use crate::plan::field::field_by_name;
+use crate::plan::field_mapper::map_type;
+use crate::plan::ir::DataSource;
+use crate::plan::var_ref::influx_type_to_var_ref_data_type;
+use crate::plan::SchemaProvider;
+use datafusion::common::Result;
+use influxdb_influxql_parser::expression::{
+    Binary, BinaryOperator, Call, Expr, VarRef, VarRefDataType,
+};
+use influxdb_influxql_parser::literal::Literal;
+use influxdb_influxql_parser::select::Dimension;
+use itertools::Itertools;
+
+/// Evaluate the type of the specified expression.
+///
+/// Derived from [Go implementation](https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L4796-L4797).
+pub(super) struct TypeEvaluator<'a> {
+    s: &'a dyn SchemaProvider,
+    from: &'a [DataSource],
+    /// Setting this to `true` will ensure scalar functions return errors for invalid data types.
+ /// The default is false, to ensure compatibility with InfluxQL OG. + call_type_is_strict: bool, +} + +impl<'a> TypeEvaluator<'a> { + /// Create a `TypeEvaluator` with behavior compatible with InfluxQL OG. + /// + /// This behavior includes limited evaluation of [`Call`] expressions, as described + /// by [`TypeEvaluator::eval_scalar`]. + pub(super) fn new(s: &'a dyn SchemaProvider, from: &'a [DataSource]) -> Self { + Self { + from, + s, + call_type_is_strict: false, + } + } + + /// Create a `TypeEvaluator` with strict behavior. + /// + /// This behavior includes strict evaluation of [`Call`] expressions, that are + /// not compatible with InfluxQL OG, but may be enabled in the future to improve + /// the user experience. + /// + /// # NOTE + /// + /// This behaviour is unused in production, but may be enabled to improve the + /// user experience of InfluxQL. + #[cfg(test)] + fn new_strict(s: &'a dyn SchemaProvider, from: &'a [DataSource]) -> Self { + Self { + from, + s, + call_type_is_strict: true, + } + } + + pub(super) fn eval_type(&self, expr: &Expr) -> Result> { + Ok(match expr { + Expr::VarRef(v) => self.eval_var_ref(v)?, + Expr::Call(v) => self.eval_call(v)?, + Expr::Binary(expr) => self.eval_binary_expr_type(expr)?, + Expr::Nested(expr) => self.eval_type(expr)?, + Expr::Literal(Literal::Float(_)) => Some(VarRefDataType::Float), + Expr::Literal(Literal::Unsigned(_)) => Some(VarRefDataType::Unsigned), + Expr::Literal(Literal::Integer(_)) => Some(VarRefDataType::Integer), + Expr::Literal(Literal::String(_)) => Some(VarRefDataType::String), + Expr::Literal(Literal::Boolean(_)) => Some(VarRefDataType::Boolean), + // Remaining patterns are not valid field types + Expr::BindParameter(_) + | Expr::Distinct(_) + | Expr::Wildcard(_) + | Expr::Literal(Literal::Duration(_)) + | Expr::Literal(Literal::Regex(_)) + | Expr::Literal(Literal::Timestamp(_)) => None, + }) + } + + fn eval_binary_expr_type(&self, expr: &Binary) -> Result> { + let (lhs, op, rhs) = ( + 
self.eval_type(&expr.lhs)?, + expr.op, + self.eval_type(&expr.rhs)?, + ); + + // Deviation from InfluxQL OG, which fails if one operand is unsigned and the other is + // an integer. This will let some additional queries succeed that would otherwise have + // failed. + // + // In this case, we will let DataFusion handle automatic coercion, rather than fail. + // + // See: https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4729-L4730 + + match (lhs, rhs) { + (Some(dt), None) | (None, Some(dt)) => Ok(Some(dt)), + (None, None) => Ok(None), + (Some(lhs), Some(rhs)) => { + Ok(Some(binary_data_type(lhs, op, rhs).ok_or_else(|| { + error::map::query(format!( + "incompatible operands for operator {op}: {lhs} and {rhs}" + )) + })?)) + } + } + } + + /// Returns the type for the specified [`VarRef`]. + /// + /// This function assumes that the expression has already been reduced. + pub(super) fn eval_var_ref(&self, expr: &VarRef) -> Result> { + Ok(match expr.data_type { + Some(dt) + if matches!( + dt, + VarRefDataType::Integer + | VarRefDataType::Unsigned + | VarRefDataType::Float + | VarRefDataType::String + | VarRefDataType::Boolean + | VarRefDataType::Tag + ) => + { + Some(dt) + } + _ => { + let mut data_type: Option = None; + for tr in self.from { + match tr { + DataSource::Table(name) => match ( + data_type, + map_type(self.s, name.as_str(), expr.name.as_str()), + ) { + (Some(existing), Some(res)) => { + if res < existing { + data_type = Some(res) + } + } + (None, Some(res)) => data_type = Some(res), + _ => continue, + }, + DataSource::Subquery(select) => { + // find the field by name + if let Some(field) = field_by_name(&select.fields, expr.name.as_str()) { + match (data_type, influx_type_to_var_ref_data_type(field.data_type)) + { + (Some(existing), Some(res)) => { + if res < existing { + data_type = Some(res) + } + } + (None, Some(res)) => data_type = Some(res), + _ => {} + } + }; + + if data_type.is_none() { + if let 
Some(group_by) = &select.group_by { + if group_by.iter().any(|dim| { + matches!(dim, Dimension::VarRef(VarRef { name, ..}) if name.as_str() == expr.name.as_str()) + }) { + data_type = Some(VarRefDataType::Tag); + } + } + } + } + } + } + + data_type + } + }) + } + + /// Evaluate the datatype of the function identified by `name`. + /// + /// Derived from [Go implementation](https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L4693) + /// and [here](https://github.com/influxdata/influxdb/blob/37088e8f5330bec0f08a376b2cb945d02a296f4e/influxql/query/functions.go#L50). + fn eval_call(&self, call: &Call) -> Result> { + // Evaluate the data types of the arguments + let arg_types: Vec<_> = call + .args + .iter() + .map(|expr| self.eval_type(expr)) + .try_collect()?; + + Ok(match call.name.as_str() { + // See: https://github.com/influxdata/influxdb/blob/e484c4d87193a475466c0285c018d16f168139e6/query/functions.go#L54-L60 + "mean" => Some(VarRefDataType::Float), + "count" => Some(VarRefDataType::Integer), + // These functions return the same type as their first argument + "min" | "max" | "sum" | "first" | "last" | "distinct" => match arg_types.first() { + Some(v) => *v, + None => None, + }, + + // See: https://github.com/influxdata/influxdb/blob/e484c4d87193a475466c0285c018d16f168139e6/query/functions.go#L80 + "median" + | "integral" + | "stddev" + | "derivative" + | "non_negative_derivative" + | "moving_average" + | "exponential_moving_average" + | "double_exponential_moving_average" + | "triple_exponential_moving_average" + | "relative_strength_index" + | "triple_exponential_derivative" + | "kaufmans_efficiency_ratio" + | "kaufmans_adaptive_moving_average" + | "chande_momentum_oscillator" + | "holt_winters" + | "holt_winters_with_fit" => Some(VarRefDataType::Float), + "elapsed" => Some(VarRefDataType::Integer), + + name => self.eval_scalar(name, &arg_types)?, + }) + } + + /// Evaluate the data type of a scalar function + /// + /// See: 
+ /// + /// 💥InfluxQL OG has a bug that it does not evaluate call types correctly, and returns + /// the incorrect type by unconditionally using the first argument. It does not even call the + /// mapper to evaluate scalar functions. We must replicate the InfluxQL OG behaviour, + /// or queries will fail, that would ordinarily succeed. + /// + /// The bug may be traced through the OG source as follows. + /// + /// Prior to executing a `SELECT`, the following steps occur to validate all the field + /// expression types. + /// + /// 1. Calls `validateTypes` to ensure all field data types are valid: + /// + /// + /// 2. Uses a `MultiTypeMapper` to evaluate types, combining: + /// + /// * a `FunctionTypeMapper` for sum, min, max, etc + /// * a `MathTypeMapper` for scalar functions like log, abs, etc + /// + /// ⚠️NOTE: the order is important. `FunctionTypeMapper` is called first. + /// + /// See: + /// + /// 3. Call `EvalType` for each field: + /// + /// See: + /// + /// 4. For fields that have call expressions, the `evalCallExprType` function is ultimately called + /// + /// See: + /// + /// 5. Because the `TypeMapper` is a `CallTypeMapper`, `evalCallExprType` eventually calls `CallType`: + /// + /// See: + /// + /// 6. The `TypeMapper` is a `multiTypeMapper` and thus calls `CallType` for each instance. The first + /// inner call that returns no error and the `typ` is not `Unknown` will be returned to the caller + /// + /// See: + /// + /// 7. Recall, the first `TypeMapper` is `FunctionTypeMapper`, so it's `CallType` is + /// called first. 
+ /// + /// 🪳Here is the bug, which is that `FunctionTypeMapper::CallType` always returns + /// the type of the first argument: + /// + /// See: + fn eval_scalar( + &self, + name: &str, + arg_types: &[Option], + ) -> Result> { + if self.call_type_is_strict { + self.eval_scalar_strict(name, arg_types) + } else { + self.eval_scalar_compatible(arg_types) + } + } + + fn eval_scalar_compatible( + &self, + arg_types: &[Option], + ) -> Result> { + Ok(arg_types.first().and_then(|v| *v)) + } + + fn eval_scalar_strict( + &self, + name: &str, + arg_types: &[Option], + ) -> Result> { + match name { + // These functions require a single numeric as input and return a float + name @ ("sin" | "cos" | "tan" | "atan" | "exp" | "log" | "ln" | "log2" | "log10" + | "sqrt") => { + match arg_types + .first() + .ok_or_else(|| error::map::query(format!("{name} expects 1 argument")))? + { + Some( + VarRefDataType::Float | VarRefDataType::Integer | VarRefDataType::Unsigned, + ) + | None => Ok(Some(VarRefDataType::Float)), + Some(arg0) => error::query(format!( + "invalid argument type for {name}: expected a number, got {arg0}" + )), + } + } + + // These functions require a single float as input and return a float + name @ ("asin" | "acos") => { + match arg_types + .first() + .ok_or_else(|| error::map::query(format!("{name} expects 1 argument")))? 
+ { + Some(VarRefDataType::Float) | None => Ok(Some(VarRefDataType::Float)), + Some(arg0) if self.call_type_is_strict => error::query(format!( + "invalid argument type for {name}: expected a float, got {arg0}" + )), + _ => Ok(None), + } + } + + // These functions require two numeric arguments and return a float + name @ ("atan2" | "pow") => { + let (Some(arg0), Some(arg1)) = (arg_types.first(), arg_types.get(1)) else { + return error::query(format!("{name} expects 2 arguments")); + }; + + match (arg0, arg1) { + (Some( + VarRefDataType::Float + | VarRefDataType::Integer + | VarRefDataType::Unsigned + ) | None, Some( + VarRefDataType::Float + | VarRefDataType::Integer + | VarRefDataType::Unsigned + ) | None) => Ok(Some(VarRefDataType::Float)), + (arg0, arg1) if self.call_type_is_strict => error::query(format!( + "invalid argument types for {name}: expected a number for both arguments, got ({arg0:?}, {arg1:?})" + )), + _ => Ok(None), + } + } + + // These functions return the same data type as their input + name @ ("abs" | "floor" | "ceil" | "round") => { + match arg_types + .first() + .cloned() + .ok_or_else(|| error::map::query(format!("{name} expects 1 argument")))? + { + // Return the same data type as the input + dt @ Some( + VarRefDataType::Float | VarRefDataType::Integer | VarRefDataType::Unsigned, + ) => Ok(dt), + // If the input is unknown, default to float + None => Ok(Some(VarRefDataType::Float)), + Some(arg0) if self.call_type_is_strict => error::query(format!( + "invalid argument type for {name}: expected a number, got {arg0}" + )), + _ => Ok(None), + } + } + _ => Ok(None), + } + } +} + +/// Determine the data type of the binary expression using the left and right operands and the operator +/// +/// This logic is derived from [InfluxQL OG][og]. 
+/// +/// [og]: https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4192 +fn binary_data_type( + lhs: VarRefDataType, + op: BinaryOperator, + rhs: VarRefDataType, +) -> Option { + use BinaryOperator::*; + use VarRefDataType::{Boolean, Float, Integer, Unsigned}; + + match (lhs, op, rhs) { + // Boolean only supports bitwise operators. + // + // See: + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4210 + (Boolean, BitwiseAnd | BitwiseOr | BitwiseXor, Boolean) => Some(Boolean), + + // A float for either operand is a float result, but only + // support the +, -, * / and % operators. + // + // See: + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4228 + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4285 + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4411 + (Float, Add | Sub | Mul | Div | Mod, Float | Integer | Unsigned) + | (Integer | Unsigned, Add | Sub | Mul | Div | Mod, Float) => Some(Float), + + // Integers using the division operator are always float + // + // See: + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4335-L4340 + // * https://github.com/influxdata/influxdb/blob/3372d3b878ebcba708dc9edfce7ea83cc8152393/query/cursor.go#L178 + (Integer, Div, Integer) => Some(Float), + + // Integer and unsigned types support all operands and + // the result is the same type if both operands are the same. 
+ // + // See: + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4314 + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4489 + (Integer, _, Integer) | (Unsigned, _, Unsigned) => Some(lhs), + + // If either side is unsigned, and the other is integer, + // the result is unsigned for all operators. + // + // See: + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4358 + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4440 + (Unsigned, _, Integer) | (Integer, _, Unsigned) => Some(Unsigned), + + // String or any other combination of operator and operands are invalid + // + // See: + // * https://github.com/influxdata/influxql/blob/802555d6b3a35cd464a6d8afa2a6511002cf3c2c/ast.go#L4562 + _ => None, + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::plan::expr_type_evaluator::binary_data_type; + use crate::plan::ir::DataSource; + use crate::plan::test_utils::MockSchemaProvider; + use assert_matches::assert_matches; + use datafusion::common::DataFusionError; + use influxdb_influxql_parser::expression::VarRefDataType; + use influxdb_influxql_parser::select::Field; + use itertools::iproduct; + + #[test] + fn test_binary_data_type() { + use influxdb_influxql_parser::expression::BinaryOperator::*; + use VarRefDataType::{Boolean, Float, Integer, String, Tag, Timestamp, Unsigned}; + + // Boolean ok + for op in [BitwiseAnd, BitwiseOr, BitwiseXor] { + assert_matches!( + binary_data_type(Boolean, op, Boolean), + Some(VarRefDataType::Boolean) + ); + } + + // Boolean !ok + for op in [Add, Sub, Div, Mul, Mod] { + assert_matches!(binary_data_type(Boolean, op, Boolean), None); + } + + // Float ok + for (op, operand) in iproduct!([Add, Sub, Div, Mul, Mod], [Float, Integer, Unsigned]) { + assert_matches!(binary_data_type(Float, op, operand), Some(Float)); + 
assert_matches!(binary_data_type(operand, op, Float), Some(Float)); + } + + // Float !ok + for (op, operand) in iproduct!( + [BitwiseAnd, BitwiseOr, BitwiseXor], + [Float, Integer, Unsigned] + ) { + assert_matches!(binary_data_type(Float, op, operand), None); + assert_matches!(binary_data_type(operand, op, Float), None); + } + + // division and integers are special + assert_matches!(binary_data_type(Integer, Div, Integer), Some(Float)); + assert_matches!(binary_data_type(Unsigned, Div, Unsigned), Some(Unsigned)); + + // Integer op Integer | Unsigned op Unsigned + for op in [Add, Sub, Mul, Mod, BitwiseAnd, BitwiseOr, BitwiseXor] { + assert_matches!(binary_data_type(Integer, op, Integer), Some(Integer)); + assert_matches!(binary_data_type(Unsigned, op, Unsigned), Some(Unsigned)); + } + + // Unsigned op Integer | Integer op Unsigned + for op in [Add, Sub, Div, Mul, Mod, BitwiseAnd, BitwiseOr, BitwiseXor] { + assert_matches!(binary_data_type(Integer, op, Unsigned), Some(Unsigned)); + assert_matches!(binary_data_type(Unsigned, op, Integer), Some(Unsigned)); + } + + // Fallible cases + + assert_matches!(binary_data_type(Tag, Add, Tag), None); + assert_matches!(binary_data_type(String, Add, String), None); + assert_matches!(binary_data_type(Timestamp, Add, Timestamp), None); + } + + #[test] + fn test_evaluate_type() { + let namespace = MockSchemaProvider::default(); + + fn evaluate_type( + s: &dyn SchemaProvider, + expr: &str, + from: &[&str], + ) -> Result> { + let from = from + .iter() + .map(ToString::to_string) + .map(DataSource::Table) + .collect::>(); + let Field { expr, .. 
} = expr.parse().unwrap(); + TypeEvaluator::new(s, &from).eval_type(&expr) + } + + let res = evaluate_type(&namespace, "shared_field0", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + let res = evaluate_type(&namespace, "shared_tag0", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Tag); + + // Unknown + let res = evaluate_type(&namespace, "not_exists", &["temp_01"]).unwrap(); + assert!(res.is_none()); + + let res = evaluate_type(&namespace, "shared_field0", &["temp_02"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + let res = evaluate_type(&namespace, "shared_field0", &["temp_02"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + // Same field across multiple measurements resolves to the highest precedence (float) + let res = evaluate_type(&namespace, "shared_field0", &["temp_01", "temp_02"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + // Explicit cast of integer field to float + let res = evaluate_type(&namespace, "SUM(field_i64::float)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + // + // Binary expressions + // + + let res = evaluate_type(&namespace, "field_f64 + field_i64", &["all_types"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + let res = evaluate_type(&namespace, "field_bool | field_bool", &["all_types"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Boolean); + + // Fallible + + // Verify incompatible operators and operator error + let res = evaluate_type(&namespace, "field_f64 & field_i64", &["all_types"]); + assert_matches!(res, Err(DataFusionError::Plan(ref s)) if s == "incompatible operands for operator &: float and integer"); + + // data types for functions + let res = evaluate_type(&namespace, "SUM(field_f64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, 
VarRefDataType::Float); + + let res = evaluate_type(&namespace, "SUM(field_i64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + let res = evaluate_type(&namespace, "SUM(field_u64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Unsigned); + + let res = evaluate_type(&namespace, "MIN(field_f64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + let res = evaluate_type(&namespace, "MAX(field_i64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + let res = evaluate_type(&namespace, "FIRST(field_str)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::String); + + let res = evaluate_type(&namespace, "LAST(field_str)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::String); + + let res = evaluate_type(&namespace, "DISTINCT(field_str)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::String); + + let res = evaluate_type(&namespace, "MEAN(field_i64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + let res = evaluate_type(&namespace, "MEAN(field_u64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + let res = evaluate_type(&namespace, "COUNT(field_f64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + let res = evaluate_type(&namespace, "COUNT(field_i64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + let res = evaluate_type(&namespace, "COUNT(field_u64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + let res = evaluate_type(&namespace, "COUNT(field_str)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + // Float functions + for call in [ + "median(field_i64)", + 
"integral(field_i64)", + "stddev(field_i64)", + "derivative(field_i64)", + "non_negative_derivative(field_i64)", + "moving_average(field_i64, 2)", + "exponential_moving_average(field_i64, 2)", + "double_exponential_moving_average(field_i64, 2)", + "triple_exponential_moving_average(field_i64, 2)", + "relative_strength_index(field_i64, 2)", + "triple_exponential_derivative(field_i64, 2)", + "kaufmans_efficiency_ratio(field_i64, 2)", + "kaufmans_adaptive_moving_average(field_i64, 2)", + "chande_momentum_oscillator(field_i64, 2)", + ] { + let res = evaluate_type(&namespace, call, &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + } + + // holt_winters + let res = evaluate_type( + &namespace, + "holt_winters(mean(field_i64), 2, 3)", + &["temp_01"], + ) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + // holt_winters_with_fit + let res = evaluate_type( + &namespace, + "holt_winters_with_fit(mean(field_i64), 2, 3)", + &["temp_01"], + ) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + // Integer functions + let res = evaluate_type(&namespace, "elapsed(field_i64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + // scalar functions + + // These require a single numeric input and return a float + let res = evaluate_type(&namespace, "sin(field_f64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + // These require a single float as input and return a float + let res = evaluate_type(&namespace, "asin(field_f64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + // These require two numeric arguments as input and return a float + let res = evaluate_type(&namespace, "atan2(field_f64, 3)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + // These require a numeric argument as input and return the same type + let res = 
evaluate_type(&namespace, "abs(field_f64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + let res = evaluate_type(&namespace, "abs(field_i64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + let res = evaluate_type(&namespace, "abs(field_u64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Unsigned); + } + + /// Validate InfluxQL OG compatible behavior for scalar functions + #[test] + fn test_evaluate_type_compat() { + let namespace = MockSchemaProvider::default(); + + fn evaluate_type( + s: &dyn SchemaProvider, + expr: &str, + from: &[&str], + ) -> Result> { + let from = from + .iter() + .map(ToString::to_string) + .map(DataSource::Table) + .collect::>(); + let Field { expr, .. } = expr.parse().unwrap(); + TypeEvaluator::new(s, &from).eval_type(&expr) + } + + let res = evaluate_type(&namespace, "sin(field_i64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + let res = evaluate_type(&namespace, "sin(field_str)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::String); + + let res = evaluate_type(&namespace, "asin(field_i64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + + // invalid number of arguments, still returns data type of first arg + let res = evaluate_type(&namespace, "atan2(field_f64)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Float); + + let res = evaluate_type(&namespace, "atan2(field_str, 3)", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::String); + let res = evaluate_type(&namespace, "atan2(field_i64, 'str')", &["temp_01"]) + .unwrap() + .unwrap(); + assert_matches!(res, VarRefDataType::Integer); + } + + /// Validates `TypeEvaluator` when in strict mode. 
+    #[test]
+    fn test_evaluate_type_strict() {
+        let namespace = MockSchemaProvider::default();
+
+        fn evaluate_type(
+            s: &dyn SchemaProvider,
+            expr: &str,
+            from: &[&str],
+        ) -> Result<Option<VarRefDataType>> {
+            let from = from
+                .iter()
+                .map(ToString::to_string)
+                .map(DataSource::Table)
+                .collect::<Vec<_>>();
+            let Field { expr, .. } = expr.parse().unwrap();
+            TypeEvaluator::new_strict(s, &from).eval_type(&expr)
+        }
+
+        // In strict mode, these scalar functions should return an error when the arguments are an
+        // invalid data type.
+
+        evaluate_type(&namespace, "sin(field_str)", &["temp_01"]).unwrap_err();
+        evaluate_type(&namespace, "asin(field_i64)", &["temp_01"]).unwrap_err();
+        evaluate_type(&namespace, "atan2(field_f64)", &["temp_01"]).unwrap_err();
+        evaluate_type(&namespace, "atan2(field_str, 3)", &["temp_01"]).unwrap_err();
+        evaluate_type(&namespace, "atan2(field_i64, 'str')", &["temp_01"]).unwrap_err();
+        evaluate_type(&namespace, "abs(field_str)", &["temp_01"]).unwrap_err();
+    }
+}
diff --git a/iox_query_influxql/src/plan/field.rs b/iox_query_influxql/src/plan/field.rs
new file mode 100644
index 0000000..cf989b1
--- /dev/null
+++ b/iox_query_influxql/src/plan/field.rs
@@ -0,0 +1,191 @@
+use crate::plan::ir::Field;
+use influxdb_influxql_parser::expression::{Call, Expr, VarRef};
+use influxdb_influxql_parser::visit::{Recursion, Visitable, Visitor};
+use std::ops::Deref;
+
+/// Returns the name of the field.
+///
+/// Prefers the alias if set, otherwise derives the name
+/// from [Expr::VarRef] or [Expr::Call]. Finally, if neither
+/// are available, falls back to an empty string.
+///
+/// Derived from [Go implementation](https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L3326-L3328)
+pub(crate) fn field_name(f: &influxdb_influxql_parser::select::Field) -> String {
+    if let Some(alias) = &f.alias {
+        return alias.deref().to_string();
+    }
+
+    let mut expr = &f.expr;
+    loop {
+        expr = match expr {
+            Expr::Call(Call { name, ..
}) => return name.clone(), + Expr::Nested(nested) => nested, + Expr::Binary { .. } => return binary_expr_name(&f.expr), + Expr::Distinct(_) => return "distinct".to_string(), + Expr::VarRef(VarRef { name, .. }) => return name.deref().into(), + Expr::Wildcard(_) | Expr::BindParameter(_) | Expr::Literal(_) => return "".to_string(), + }; + } +} + +/// Returns the expression that matches the field name. +/// +/// If the name matches one of the arguments to +/// "top" or "bottom", the variable reference inside of the function is returned. +/// +/// Derive from [this implementation](https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L1725) +/// +/// **NOTE** +/// +/// This implementation duplicates the behavior of the original implementation, including skipping the +/// first argument. It is likely the original intended to skip the _last_ argument, which is the number +/// of rows. +pub(super) fn field_by_name<'a>(fields: &'a [Field], name: &str) -> Option<&'a Field> { + fields.iter().find(|f| f.name == name || match &f.expr { + Expr::Call(Call{ name: func_name, args }) if (func_name == "top" + || func_name == "bottom") + && args.len() > 2 => + args[1..].iter().any(|f| matches!(f, Expr::VarRef(VarRef{ name: field_name, .. }) if field_name.as_str() == name)), + _ => false, + }) +} + +struct BinaryExprNameVisitor<'a>(&'a mut Vec); + +impl<'a> Visitor for BinaryExprNameVisitor<'a> { + type Error = (); + + fn pre_visit_var_ref(self, n: &VarRef) -> Result, Self::Error> { + self.0.push(n.name.to_string()); + Ok(Recursion::Continue(self)) + } + + fn pre_visit_call(self, n: &Call) -> Result, Self::Error> { + self.0.push(n.name.clone()); + Ok(Recursion::Stop(self)) + } +} + +/// Returns the name of a binary expression by concatenating +/// the names of any [Expr::VarRef] and [Expr::Call] with underscores. 
+/// +/// Derived from [Go implementation](https://github.com/influxdata/influxql/blob/1ba470371ec093d57a726b143fe6ccbacf1b452b/ast.go#L3729-L3731) +fn binary_expr_name(expr: &Expr) -> String { + let mut names = Vec::new(); + let vis = BinaryExprNameVisitor(&mut names); + expr.accept(vis).unwrap(); // It is not expected to fail + names.join("_") +} + +#[cfg(test)] +mod test { + use crate::plan::field::{field_by_name, field_name}; + use crate::plan::ir; + use assert_matches::assert_matches; + use influxdb_influxql_parser::select::Field; + + #[test] + fn test_field_name() { + let f: Field = "usage".parse().unwrap(); + assert_eq!(field_name(&f), "usage"); + + let f: Field = "usage as u2".parse().unwrap(); + assert_eq!(field_name(&f), "u2"); + + let f: Field = "(usage)".parse().unwrap(); + assert_eq!(field_name(&f), "usage"); + + let f: Field = "COUNT(usage)".parse().unwrap(); + assert_eq!(field_name(&f), "count"); + + let f: Field = "COUNT(usage) + SUM(usage_idle)".parse().unwrap(); + assert_eq!(field_name(&f), "count_sum"); + + let f: Field = "1+2".parse().unwrap(); + assert_eq!(field_name(&f), ""); + + let f: Field = "1 + usage".parse().unwrap(); + assert_eq!(field_name(&f), "usage"); + + let f: Field = "/reg/".parse().unwrap(); + assert_eq!(field_name(&f), ""); + + let f: Field = "DISTINCT usage".parse().unwrap(); + assert_eq!(field_name(&f), "distinct"); + + let f: Field = "-usage".parse().unwrap(); + assert_eq!(field_name(&f), "usage"); + + // Doesn't quote keyword + let f: Field = "\"user\"".parse().unwrap(); + assert_eq!(field_name(&f), "user"); + } + + #[test] + fn test_field_by_name() { + fn parse_fields(exprs: Vec<&str>) -> Vec { + exprs + .iter() + .map(|s| { + let f: Field = s.parse().unwrap(); + let name = field_name(&f); + let data_type = None; + ir::Field { + expr: f.expr, + name, + data_type, + } + }) + .collect() + } + let stmt = parse_fields(vec!["usage", "idle"]); + assert_eq!( + format!("{}", field_by_name(&stmt, "usage").unwrap()), + "usage AS 
usage" + ); + + let stmt = parse_fields(vec!["usage as foo", "usage"]); + assert_eq!( + format!("{}", field_by_name(&stmt, "foo").unwrap()), + "usage AS foo" + ); + + let stmt = parse_fields(vec!["top(idle, usage, 5)", "usage"]); + assert_eq!( + format!("{}", field_by_name(&stmt, "usage").unwrap()), + "top(idle, usage, 5) AS top" + ); + + let stmt = parse_fields(vec!["bottom(idle, usage, 5)", "usage"]); + assert_eq!( + format!("{}", field_by_name(&stmt, "usage").unwrap()), + "bottom(idle, usage, 5) AS bottom" + ); + + // TOP is in uppercase, to ensure we can expect the function name to be + // uniformly lowercase. + let stmt = parse_fields(vec!["TOP(idle, usage, 5) as foo", "usage"]); + assert_eq!( + format!("{}", field_by_name(&stmt, "usage").unwrap()), + "top(idle, usage, 5) AS foo" + ); + assert_eq!( + format!("{}", field_by_name(&stmt, "foo").unwrap()), + "top(idle, usage, 5) AS foo" + ); + + // Not exists + + let stmt = parse_fields(vec!["usage", "idle"]); + assert_matches!(field_by_name(&stmt, "bar"), None); + + // Does not match name by first argument to top or bottom, per + // bug in original implementation. 
+ let stmt = parse_fields(vec!["top(foo, usage, 5)", "idle"]); + assert_matches!(field_by_name(&stmt, "foo"), None); + assert_eq!( + format!("{}", field_by_name(&stmt, "idle").unwrap()), + "idle AS idle" + ); + } +} diff --git a/iox_query_influxql/src/plan/field_mapper.rs b/iox_query_influxql/src/plan/field_mapper.rs new file mode 100644 index 0000000..24d09d9 --- /dev/null +++ b/iox_query_influxql/src/plan/field_mapper.rs @@ -0,0 +1,92 @@ +use crate::plan::ir::TagSet; +use crate::plan::var_ref::{field_type_to_var_ref_data_type, influx_type_to_var_ref_data_type}; +use crate::plan::SchemaProvider; +use influxdb_influxql_parser::expression::VarRefDataType; +use schema::InfluxColumnType; +use std::collections::HashMap; + +pub(crate) type FieldTypeMap = HashMap; + +pub(crate) fn field_and_dimensions( + s: &dyn SchemaProvider, + name: &str, +) -> Option<(FieldTypeMap, TagSet)> { + s.table_schema(name).map(|iox| { + let mut field_set = FieldTypeMap::new(); + let mut tag_set = TagSet::new(); + + for col in iox.iter() { + match col { + (InfluxColumnType::Field(ft), f) => { + field_set.insert(f.name().to_owned(), field_type_to_var_ref_data_type(ft)); + } + (InfluxColumnType::Tag, f) => { + tag_set.insert(f.name().to_owned()); + } + (InfluxColumnType::Timestamp, _) => {} + } + } + (field_set, tag_set) + }) +} + +pub(crate) fn map_type( + s: &dyn SchemaProvider, + measurement_name: &str, + field: &str, +) -> Option { + s.table_schema(measurement_name).and_then(|iox| { + iox.field_by_name(field) + .and_then(|(dt, _)| influx_type_to_var_ref_data_type(Some(dt))) + }) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::plan::test_utils::MockSchemaProvider; + use assert_matches::assert_matches; + + #[test] + fn test_schema_field_mapper() { + let namespace = MockSchemaProvider::default(); + + // Measurement exists + let (field_set, tag_set) = field_and_dimensions(&namespace, "cpu").unwrap(); + assert_eq!( + field_set, + FieldTypeMap::from([ + ("usage_user".to_string(), 
VarRefDataType::Float), + ("usage_system".to_string(), VarRefDataType::Float), + ("usage_idle".to_string(), VarRefDataType::Float), + ]) + ); + assert_eq!( + tag_set, + TagSet::from(["cpu".to_string(), "host".to_string(), "region".to_string()]) + ); + + // Measurement does not exist + assert!(field_and_dimensions(&namespace, "cpu2").is_none()); + + // `map_type` API calls + + // Returns expected type + assert_matches!( + map_type(&namespace, "cpu", "usage_user"), + Some(VarRefDataType::Float) + ); + assert_matches!( + map_type(&namespace, "cpu", "host"), + Some(VarRefDataType::Tag) + ); + assert_matches!( + map_type(&namespace, "cpu", "time"), + Some(VarRefDataType::Timestamp) + ); + // Returns None for nonexistent field + assert!(map_type(&namespace, "cpu", "nonexistent").is_none()); + // Returns None for nonexistent measurement + assert!(map_type(&namespace, "nonexistent", "usage").is_none()); + } +} diff --git a/iox_query_influxql/src/plan/ir.rs b/iox_query_influxql/src/plan/ir.rs new file mode 100644 index 0000000..7ee811d --- /dev/null +++ b/iox_query_influxql/src/plan/ir.rs @@ -0,0 +1,235 @@ +//! Defines data structures which represent an InfluxQL +//! statement after it has been processed + +use crate::error; +use crate::plan::rewriter::ProjectionType; +use datafusion::common::Result; +use influxdb_influxql_parser::common::{ + LimitClause, MeasurementName, OffsetClause, OrderByClause, QualifiedMeasurementName, + WhereClause, +}; +use influxdb_influxql_parser::expression::{ConditionalExpression, Expr}; +use influxdb_influxql_parser::select::{ + FieldList, FillClause, FromMeasurementClause, GroupByClause, MeasurementSelection, + SelectStatement, TimeZoneClause, +}; +use influxdb_influxql_parser::time_range::TimeRange; +use schema::{InfluxColumnType, Schema}; +use std::collections::HashSet; +use std::fmt::{Display, Formatter}; + +use super::SchemaProvider; + +/// A set of tag keys. 
+pub(super) type TagSet = HashSet<String>;
+
+/// Represents a validated and normalized top-level [`SelectStatement`].
+#[derive(Debug, Default, Clone)]
+pub(super) struct SelectQuery {
+    pub(super) select: Select,
+}
+
+#[derive(Debug, Default, Clone)]
+pub(super) struct Select {
+    /// The projection type of the selection.
+    pub(super) projection_type: ProjectionType,
+
+    /// The interval derived from the arguments to the `TIME` function
+    /// when a `GROUP BY` clause is declared with `TIME`.
+    pub(super) interval: Option<Interval>,
+
+    /// The number of additional intervals that must be read
+    /// for queries that group by time and use window functions such as
+    /// `DIFFERENCE` or `DERIVATIVE`. This ensures data for the first
+    /// window is available.
+    ///
+    /// See: 
+    pub(super) extra_intervals: usize,
+
+    /// Projection clause of the selection.
+    pub(super) fields: Vec<Field>,
+
+    /// A list of data sources for the selection.
+    pub(super) from: Vec<DataSource>,
+
+    /// A conditional expression to filter the selection, excluding any predicates for the `time`
+    /// column.
+    pub(super) condition: Option<ConditionalExpression>,
+
+    /// The time range derived from the `WHERE` clause of the `SELECT` statement.
+    pub(super) time_range: TimeRange,
+
+    /// The GROUP BY clause of the selection.
+    pub(super) group_by: Option<GroupByClause>,
+
+    /// The set of possible tags for the selection, by combining
+    /// the tag sets of all inputs via the `FROM` clause.
+    pub(super) tag_set: TagSet,
+
+    /// The [fill] clause specifies the fill behaviour for the selection. If the value is [`None`],
+    /// it is the same behavior as `fill(null)`.
+    ///
+    /// [fill]: https://docs.influxdata.com/influxdb/v1.8/query_language/explore-data/#group-by-time-intervals-and-fill
+    pub(super) fill: Option<FillClause>,
+
+    /// Configures the ordering of the selection by time.
+    pub(super) order_by: Option<OrderByClause>,
+
+    /// A value to restrict the number of rows returned.
+    pub(super) limit: Option<LimitClause>,
+
+    /// A value to specify an offset to start retrieving rows.
+    pub(super) offset: Option<OffsetClause>,